SpGEMM bad_alloc Error #31

Open · wants to merge 7 commits into base: main

Changes from 1 commit
SpGEMM inner product implementation for input matrices of size <= TILE_SIZE
Floruaaa666 committed Jun 14, 2024
commit 9b67c749f8a20523cb39551ae1399dda91e434b0
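For orientation: an inner-product SpGEMM computes each entry C(m, n) as the dot product of row m of A with column n of B, keeping only the non-zero sums. A minimal dense host reference of that formulation (my sketch, not code from this PR; the names and the TILE_SIZE bound are taken from the commit message):

#include <cstddef>

constexpr std::size_t TILE_SIZE = 32;

// Dense reference for tiles no larger than TILE_SIZE: each C(m, n) is the
// inner product of row m of A with column n of B; a non-zero sum marks a
// structural non-zero of C.
void spgemm_inner_product_ref(const float (&A)[TILE_SIZE][TILE_SIZE],
                              const float (&B)[TILE_SIZE][TILE_SIZE],
                              float (&C)[TILE_SIZE][TILE_SIZE]) {
  for (std::size_t m = 0; m < TILE_SIZE; ++m)
    for (std::size_t n = 0; n < TILE_SIZE; ++n) {
      float sum = 0.0f;
      for (std::size_t k = 0; k < TILE_SIZE; ++k)
        sum += A[m][k] * B[k][n];
      C[m][n] = sum;
    }
}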
14 changes: 10 additions & 4 deletions include/loops/algorithms/spgemm/estimate_nnz_test.cuh
@@ -1006,6 +1006,7 @@ __global__ void __estimate_nnz_row_col_pairs_v4(setup_t config,


// Precalculate the column indices of C
/*
template <typename setup_t,
typename index_t,
typename offset_t,
@@ -1205,7 +1206,7 @@ __global__ void __precalculate_c_col_indices(setup_t config,
}
__syncthreads();
}

*/

/**
* @brief Estimate the nnz of output matrix C.
@@ -1307,8 +1308,6 @@ void estimate_nnz_test_v3(csr_t<index_t, offset_t, type_t>& csr,
int* c_nnz_per_tile,
cudaStream_t stream = 0) {


// Create a schedule.
constexpr std::size_t block_size = TILE_SIZE;
// constexpr dim3 block_size(TILE_SIZE, TILE_SIZE, 1);

@@ -1319,7 +1318,11 @@ void estimate_nnz_test_v3(csr_t<index_t, offset_t, type_t>& csr,

// dim3 grid_size((csc.cols + block_size.x - 1) / block_size.x, (csr.rows + block_size.y - 1) / block_size.y, 1);
// dim3 grid_size((csc.cols + block_size.x - 1) / block_size.x, csr.rows, 1);
std::size_t grid_size = (csr.rows + block_size - 1) / block_size;



// std::size_t grid_size = (csr.rows + block_size - 1) / block_size;
std::size_t grid_size = 32;
printf("grid_size: %ld\n", grid_size);


@@ -1334,6 +1337,7 @@ void estimate_nnz_test_v3(csr_t<index_t, offset_t, type_t>& csr,
cudaStreamSynchronize(stream);
}
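The estimate step above is purely symbolic: it counts structural non-zeros of C without computing any values. A host-side sketch of that counting logic, under my assumed layout (row m of A given as its column-index list, B given column-wise as in CSC); this is an illustration, not the kernel's code:

#include <vector>

// Each column n of B that shares at least one index with row m of A
// contributes one structural non-zero to row m of C.
int count_row_nnz(const std::vector<int>& a_cols_of_row_m,
                  const std::vector<std::vector<int>>& b_rows_per_col) {
  int nnz = 0;
  for (const auto& col : b_rows_per_col) {
    bool hit = false;
    for (int k : a_cols_of_row_m) {
      for (int r : col)
        if (k == r) { hit = true; break; }
      if (hit) break;
    }
    if (hit) ++nnz;
  }
  return nnz;
}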


/**
* @brief Precalculate the column indices array of C
*
@@ -1346,6 +1350,7 @@ void estimate_nnz_test_v3(csr_t<index_t, offset_t, type_t>& csr,
* @param C Output matrix C (GPU).
* @param stream CUDA stream.
*/
/*
template <typename index_t, typename offset_t, typename type_t>
void precalculate_c_col_indices(csr_t<index_t, offset_t, type_t>& csr,
csc_t<index_t, offset_t, type_t>& csc,
@@ -1375,6 +1380,7 @@ void precalculate_c_col_indices(csr_t<index_t, offset_t, type_t>& csr,

cudaStreamSynchronize(stream);
}
*/

// template <typename offset_t>
void scanNnzC(int* c_nnz_per_tile, int* c_offsets, std::size_t c_rows){
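The body of scanNnzC is cut off in this hunk. One plausible implementation, assuming both pointers refer to device memory and c_offsets has room for c_rows + 1 entries (a sketch under those assumptions, not the PR's code):

#include <cstddef>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

// An exclusive scan turns per-row nnz counts into CSR row offsets:
// offsets[i] = counts[0] + ... + counts[i - 1], with offsets[0] = 0.
void scanNnzC_sketch(int* c_nnz_per_tile, int* c_offsets, std::size_t c_rows) {
  thrust::device_ptr<int> counts(c_nnz_per_tile);
  thrust::device_ptr<int> offsets(c_offsets);
  thrust::exclusive_scan(counts, counts + c_rows, offsets);
  // The final offset is the total nnz of C.
  offsets[c_rows] = offsets[c_rows - 1] + counts[c_rows - 1];
}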
97 changes: 62 additions & 35 deletions include/loops/algorithms/spgemm/thread_mapped.cuh
@@ -21,6 +21,10 @@
#include <loops/util/device.hxx>
#include <loops/memory.hxx>
#include <iostream>
#include <stdio.h>
#include <cub/cub.cuh>

#include <array>

// #define tests 1
#define SPGEMM_TILE_SIZE 32
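// Assumption from the commit message: inputs fit in a single tile, i.e. at
// most SPGEMM_TILE_SIZE rows of A and SPGEMM_TILE_SIZE columns of B.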
@@ -192,17 +196,19 @@ __global__ void __thrad_mapped_row_col_pairs_v1(setup_t config,
__shared__ index_t shared_A_cols[SPGEMM_TILE_SIZE * SPGEMM_TILE_SIZE];
__shared__ index_t shared_B_rows[SPGEMM_TILE_SIZE * SPGEMM_TILE_SIZE];

__shared__ type_t shared_A_values[SPGEMM_TILE_SIZE * SPGEMM_TILE_SIZE];
__shared__ type_t shared_B_values[SPGEMM_TILE_SIZE * SPGEMM_TILE_SIZE];

// Keep track of the column indices of the non-zeros in the m0th row of C; if the column has a non-zero element, set the flag to 1, else 0
__shared__ int C_m0_flag[SPGEMM_TILE_SIZE];

int tx = threadIdx.x, bx = blockIdx.x;
// For every block: load ONE row of A into shared memory, load as much of B as possible into shared memory
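// Assumed mapping (implied by the code below, not stated in the diff):
// block bx computes row m = bx of C; thread tx pairs that row with column
// n = tx of B, producing the single candidate entry C(m, tx).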

auto m = bx;

C_m0_flag[tx] = 0;
__syncthreads();

// Load the mth row of A into shared memory
if(m < a_rows){
auto col_arr_start = a_offsets[m];
@@ -211,6 +217,7 @@ __global__ void __thrad_mapped_row_col_pairs_v1(setup_t config,

// Every thread loads one element of the mth row of A into shared memory
shared_A_cols[tx] = a_indices[col_arr_start + tx];
shared_A_values[tx] = a_values[col_arr_start + tx];
__syncthreads();
}

@@ -220,6 +227,7 @@ __global__ void __thrad_mapped_row_col_pairs_v1(setup_t config,
auto row_arr_end = b_offsets[n + 1];
for(int k0 = row_arr_start; k0 < row_arr_end; ++k0){
shared_B_rows[k0] = b_indices[k0];
shared_B_values[k0] = b_values[k0];
}
__syncthreads();

@@ -229,78 +237,95 @@ __global__ void __thrad_mapped_row_col_pairs_v1(setup_t config,
auto start = b_offsets[0];
for(int k0 = 0; k0 < b_nnz; ++k0){
// if(shared_B_rows[k0] != b_indices[start + k0]){
printf("shared_B_rows[%d] = %d b_indices[%d] = %d\n", k0, shared_B_rows[k0], start + k0, b_indices[start + k0]);
printf("shared_B_values[%d] = %f b_values[%d] = %f\n", k0, shared_B_values[k0], start + k0, b_values[start + k0]);
// }
}
}
}*/
}
*/

std::array <int, 8> helperArray;
std::array <int, 10> helperArray;
std::array <float, 4> valueArray;
if(m < a_rows){
int n = tx;
auto row_arr_start = b_offsets[n];
auto row_arr_end = b_offsets[n + 1];

auto sum = 0;
float sum = 0;

for(int row_arr_itr_b = row_arr_start; row_arr_itr_b < row_arr_end; ++row_arr_itr_b){ // Iterate over all the elements in nth column of B

// TODO: Can put this part outside the current for loop?
auto col_arr_start = a_offsets[m];
auto col_arr_end = a_offsets[m + 1];
auto range = col_arr_end - col_arr_start;

for(auto col_arr_itr_a = 0; col_arr_itr_a < range; ++col_arr_itr_a){
if((shared_A_cols[col_arr_itr_a] == shared_B_rows[row_arr_itr_b])){
sum += shared_A_values[col_arr_itr_a] * shared_B_values[row_arr_itr_b];

C_m0_flag[tx] = 1;

}
}
}

sum += a_values[col_arr_itr_a] * b_values[row_arr_itr_b];
/*
if(bx == 30 && sum != 0){
helperArray[0] = m;
helperArray[1] = n;
valueArray[0] = sum;

/*
if(bx == 1){
helperArray[0] = m;
helperArray[1] = n;
helperArray[2] = col_arr_itr_a;
helperArray[3] = shared_A_cols[col_arr_itr_a];
helperArray[4] = row_arr_itr_b - row_arr_start;
helperArray[5] = shared_B_rows[row_arr_itr_b];
helperArray[6] = C_n_nnz_per_block[n];
printf("(m, n): (%d, %d)\nsum: %f\n", helperArray[0], helperArray[1], valueArray[0]);
}
*/

printf("m(bx): %d, n(tx): %d, col_arr_itr_a: %d\nshared_A_cols[%d]: %d, shared_B_rows[%d]: %d\nC_n_nnz_per_block[%d]: %d\n", helperArray[0], helperArray[1], helperArray[2], helperArray[2], helperArray[3], helperArray[4], helperArray[5], helperArray[1], helperArray[6]);
}*/
// C_m0_flag[tx] = (sum != 0);
__syncthreads();

/*
if(bx == 30 && tx == 0){
for(int i = 0; i < SPGEMM_TILE_SIZE; ++i){
if(C_m0_flag[i] == 1){
printf("C_m0_flag[%d]: %d\n", i, C_m0_flag[i]);
}
}
}
*/

C_m0_flag[tx] = (sum != 0);
__syncthreads();
typedef cub::BlockScan<int, SPGEMM_TILE_SIZE> BlockScan;
__shared__ typename BlockScan::TempStorage temp_storage;
int m0_col_idx_c;
BlockScan(temp_storage).InclusiveSum(C_m0_flag[tx], m0_col_idx_c);

typedef cub::BlockReduce<int, SPGEMM_TILE_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int m0_col_idx_c = BlockReduce(temp_storage).Sum(C_m0_flag);
/*
if(bx == 30){
printf("tx: %d, m0_col_idx_c: %d\n", tx, m0_col_idx_c);
}
*/

int col_arr_idx_c = m0_col_idx_c - 1 + a_offsets[m];
int col_arr_idx_c = m0_col_idx_c - 1 + c_offsets[m];

if(C_m0_flag[tx]){
c_indices[col_arr_idx_c] = tx;
c_values[col_arr_idx_c] = sum;
}
__syncthreads();

/*
if(bx == 30 && tx == 0){
for(int i = 0; i < c_offsets[m + 1] - c_offsets[m]; ++i){
printf("c_indices[%d]: %d, c_values[%d]: %f\n", i, c_indices[i], i, c_values[i]);
}
}
*/

}
__syncthreads();
}


// typedef cub::BlockReduce<int, SPGEMM_TILE_SIZE> BlockReduce;
// __shared__ typename BlockReduce::TempStorage temp_storage;
// int C_nnz_per_row = BlockReduce(temp_storage).Sum(C_n_nnz);

// for(int i = 0; i < gridDim.x; ++i){
// if(bx == i && tx == 0){
// printf("bx: %d, C_nnz_per_row: %d\n", bx, C_nnz_per_row);
// }
// }

}
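The scan-and-compact step in the kernel above is the standard stream-compaction pattern: each thread raises a 0/1 flag, a block-wide inclusive prefix sum assigns each surviving element a dense write slot, and a guarded write stores it. A self-contained sketch of just that pattern (assumed names; not the PR's kernel):

#include <cub/cub.cuh>

template <int TILE>
__global__ void compact_by_flag(const int* flags, const float* vals,
                                int* out_cols, float* out_vals) {
  typedef cub::BlockScan<int, TILE> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int tx = threadIdx.x;
  int flag = flags[tx];
  int rank;  // inclusive prefix sum of flags up to and including tx
  BlockScan(temp_storage).InclusiveSum(flag, rank);

  // A set flag means this thread owns a surviving element; rank - 1 is its
  // dense output slot, analogous to col_arr_idx_c above.
  if (flag) {
    out_cols[rank - 1] = tx;
    out_vals[rank - 1] = vals[tx];
  }
}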

/*
template <typename setup_t,
@@ -583,7 +608,9 @@ void thread_mapped_v2(csr_t<index_t, offset_t, type_t>& csr,
index_t, offset_t>;
setup_t config(csr.offsets.data().get(), csr.rows, csr.nnzs);

std::size_t grid_size = (csr.rows + block_size - 1) / block_size;
// std::size_t grid_size = (csr.rows + block_size - 1) / block_size;
std::size_t grid_size = 32;

printf("grid_size: %ld\n", grid_size);

launch::non_cooperative(