Realm gpu #51

Open · wants to merge 5 commits into base: cuda
193 changes: 184 additions & 9 deletions core/core.cc
@@ -80,11 +80,42 @@ void Kernel::execute(long graph_index, long timestep, long point,
assert(timestep >= 0 && point >= 0);
execute_kernel_imbalance(*this, graph_index, timestep, point);
break;
default:
assert(false && "unimplemented kernel type");
};
}

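// New overload (added by this PR): same dispatch as the two-argument form
// above, but routes CUDA_COMPUTE_BOUND kernels to a specific local GPU.
// Judging from cuda_kernel.cu below, gpu_id indexes the per-GPU
// local_buffer/cuda_stream_array tables built by init_cuda_support(),
// not a raw CUDA device ordinal.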
void Kernel::execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, int gpu_id) const
{
switch(type) {
case KernelType::EMPTY:
execute_kernel_empty(*this);
break;
#ifdef ENABLE_CUDA
case KernelType::CUDA_COMPUTE_BOUND:
assert(scratch_ptr != NULL);
assert(scratch_bytes > 0);
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes);
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, gpu_id);
break;
#endif
default:
assert(false && "unimplemented kernel type");
};
}

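// Second new overload: rather than an internally managed per-GPU buffer,
// the caller hands in a ready-to-use device allocation (device_ptr /
// device_bytes). This looks intended for the Realm path, where the runtime
// owns GPU memory and the current CUDA device is set by the calling context.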
void Kernel::execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes) const
{
switch(type) {
case KernelType::EMPTY:
execute_kernel_empty(*this);
break;
#ifdef ENABLE_CUDA
case KernelType::CUDA_COMPUTE_BOUND:
assert(scratch_ptr != NULL);
assert(scratch_bytes > 0);
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, device_ptr, device_bytes);
break;
#endif
default:
@@ -633,6 +664,156 @@ void TaskGraph::execute_point(long timestep, long point,
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes);
}

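// The two execute_point() overloads below duplicate the validation and
// output-generation logic of the existing host-only version; they differ
// only in the trailing arguments forwarded to Kernel::execute().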
void TaskGraph::execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
int gpu_id) const
{
#ifdef DEBUG_CORE
// Validate graph_index
assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8);
has_executed_graph |= 1 << graph_index;
#endif

// Validate timestep and point
assert(0 <= timestep && timestep < timesteps);

long offset = offset_at_timestep(timestep);
long width = width_at_timestep(timestep);
assert(offset <= point && point < offset+width);

long last_offset = offset_at_timestep(timestep-1);
long last_width = width_at_timestep(timestep-1);

// Validate input
{
size_t idx = 0;
long dset = dependence_set_at_timestep(timestep);
size_t max_deps = num_dependencies(dset, point);
std::pair<long, long> *deps = reinterpret_cast<std::pair<long, long> *>(alloca(sizeof(std::pair<long, long>) * max_deps));
size_t num_deps = dependencies(dset, point, deps);
for (size_t span = 0; span < num_deps; span++) {
for (long dep = deps[span].first; dep <= deps[span].second; dep++) {
if (last_offset <= dep && dep < last_offset + last_width) {
assert(idx < n_inputs);

assert(input_bytes[idx] == output_bytes_per_task);
assert(input_bytes[idx] >= sizeof(std::pair<long, long>));

const std::pair<long, long> *input = reinterpret_cast<const std::pair<long, long> *>(input_ptr[idx]);
for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair<long, long>); ++i) {
assert(input[i].first == timestep - 1);
assert(input[i].second == dep);
}
idx++;
}
}
}
// FIXME (Elliott): Legion is currently passing in uninitialized
// memory for dependencies outside of the last offset/width.
// assert(idx == n_inputs);
}

// Validate output
assert(output_bytes == output_bytes_per_task);
assert(output_bytes >= sizeof(std::pair<long, long>));

// Generate output
std::pair<long, long> *output = reinterpret_cast<std::pair<long, long> *>(output_ptr);
for (size_t i = 0; i < output_bytes/sizeof(std::pair<long, long>); ++i) {
output[i].first = timestep;
output[i].second = point;
}

// Validate scratch
assert(scratch_bytes == scratch_bytes_per_task);
if (scratch_bytes > 0) {
uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
assert(*scratch == MAGIC_VALUE);
}

// Execute kernel
Kernel k(kernel);
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, gpu_id);
}

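// Same body as the gpu_id overload above, with device_ptr/device_bytes
// forwarded verbatim in place of a GPU index.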
void TaskGraph::execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
char *device_ptr, size_t device_bytes) const
{
#ifdef DEBUG_CORE
// Validate graph_index
assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8);
has_executed_graph |= 1 << graph_index;
#endif

// Validate timestep and point
assert(0 <= timestep && timestep < timesteps);

long offset = offset_at_timestep(timestep);
long width = width_at_timestep(timestep);
assert(offset <= point && point < offset+width);

long last_offset = offset_at_timestep(timestep-1);
long last_width = width_at_timestep(timestep-1);

// Validate input
{
size_t idx = 0;
long dset = dependence_set_at_timestep(timestep);
size_t max_deps = num_dependencies(dset, point);
std::pair<long, long> *deps = reinterpret_cast<std::pair<long, long> *>(alloca(sizeof(std::pair<long, long>) * max_deps));
size_t num_deps = dependencies(dset, point, deps);
for (size_t span = 0; span < num_deps; span++) {
for (long dep = deps[span].first; dep <= deps[span].second; dep++) {
if (last_offset <= dep && dep < last_offset + last_width) {
assert(idx < n_inputs);

assert(input_bytes[idx] == output_bytes_per_task);
assert(input_bytes[idx] >= sizeof(std::pair<long, long>));

const std::pair<long, long> *input = reinterpret_cast<const std::pair<long, long> *>(input_ptr[idx]);
for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair<long, long>); ++i) {
assert(input[i].first == timestep - 1);
assert(input[i].second == dep);
}
idx++;
}
}
}
// FIXME (Elliott): Legion is currently passing in uninitialized
// memory for dependencies outside of the last offset/width.
// assert(idx == n_inputs);
}

// Validate output
assert(output_bytes == output_bytes_per_task);
assert(output_bytes >= sizeof(std::pair<long, long>));

// Generate output
std::pair<long, long> *output = reinterpret_cast<std::pair<long, long> *>(output_ptr);
for (size_t i = 0; i < output_bytes/sizeof(std::pair<long, long>); ++i) {
output[i].first = timestep;
output[i].second = point;
}

// Validate scratch
assert(scratch_bytes == scratch_bytes_per_task);
if (scratch_bytes > 0) {
uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
assert(*scratch == MAGIC_VALUE);
}

// Execute kernel
Kernel k(kernel);
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, device_ptr, device_bytes);
}

void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes)
{
assert(scratch_bytes % sizeof(uint64_t) == 0);
@@ -654,7 +835,7 @@ static TaskGraph default_graph(long graph_index)
graph.period = -1;
graph.fraction_connected = 0.25;
#ifdef ENABLE_CUDA
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 0, 1};
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 1};
#else
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0};
#endif
@@ -962,9 +1143,6 @@ App::App(int argc, char **argv)

check();

#ifdef ENABLE_CUDA
init_cuda_support(graphs);
#endif
}

void App::check() const
@@ -1253,8 +1431,5 @@ void App::report_timing(double elapsed_seconds) const
#ifdef DEBUG_CORE
printf("Task Graph Execution Mask %llx\n", has_executed_graph.load());
#endif

#ifdef ENABLE_CUDA
fini_cuda_support();
#endif

}
16 changes: 16 additions & 0 deletions core/core.h
@@ -34,6 +34,10 @@ struct Kernel : public kernel_t {
private:
void execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes) const;
void execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, int gpu_id) const;
void execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, char* device_ptr, size_t device_bytes) const;
friend struct TaskGraph;
};

@@ -69,6 +73,18 @@ struct TaskGraph : public task_graph_t {
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes) const;
void execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
int gpu_id) const;
void execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
char *device_ptr, size_t device_bytes) const;
static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes);
};

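For orientation, here is a minimal sketch of how a runtime-side GPU task might drive the new gpu_id overload. All names here (gpu_task_body, my_gpu_index) are hypothetical, and it assumes init_cuda_support() has already been called with the process-local GPU list:

// Hypothetical caller -- illustrative only, not part of this PR.
void gpu_task_body(const TaskGraph &graph, long timestep, long point,
                   char *output_ptr, size_t output_bytes,
                   const char **input_ptr, const size_t *input_bytes,
                   size_t n_inputs,
                   char *scratch_ptr, size_t scratch_bytes,
                   int my_gpu_index /* index into the local GPU list */)
{
  graph.execute_point(timestep, point,
                      output_ptr, output_bytes,
                      input_ptr, input_bytes, n_inputs,
                      scratch_ptr, scratch_bytes,
                      my_gpu_index);
}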
1 change: 0 additions & 1 deletion core/core_c.h
@@ -63,7 +63,6 @@ typedef struct kernel_t {
int nb_blocks;
int threads_per_block;
int memcpy_required;
int gpu_id;
int cuda_unroll;
#endif
} kernel_t;
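/* Note that gpu_id is removed from kernel_t entirely: the target GPU is now
   chosen per execution via the new execute()/execute_point() overloads
   instead of being baked into the kernel configuration. */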
72 changes: 49 additions & 23 deletions core/cuda_kernel.cu
@@ -3,7 +3,6 @@
#include "cuda_kernel.h"

std::vector<char*> local_buffer;
int nb_local_buffer = 0;
size_t local_buffer_size;
cudaStream_t cuda_stream_array[8];

@@ -27,61 +26,88 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A);
__global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, double *A);
__global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);

void init_cuda_support(const std::vector<TaskGraph> &graphs)
void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
{
int nb_gpus = 1;
int nb_gpus = local_gpus.size();

nb_local_buffer = nb_gpus;
local_buffer.reserve(nb_local_buffer);
local_buffer.resize(nb_gpus); // resize rather than reserve: local_buffer[i] is assigned below, so the elements must exist
int nb_blocks = graphs[0].kernel.nb_blocks;
int threads_per_block = graphs[0].kernel.threads_per_block;
int cuda_unroll = graphs[0].kernel.cuda_unroll;
printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
// printf("init cuda support nb_gpus %d nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_gpus, nb_blocks, threads_per_block, cuda_unroll);
local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
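// Note: local_buffer_size omits the cuda_unroll factor used in the
// cudaMalloc below, so the scratch_bytes <= local_buffer_size assert in
// execute_kernel_compute_cuda() bounds only the host<->device copy region,
// not the full per-GPU allocation.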
for (int i = 0; i < nb_gpus; i++) {
gpuErrchk( cudaSetDevice(0) );
gpuErrchk( cudaSetDevice(local_gpus[i]) );
gpuErrchk( cudaMalloc((void**)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
assert(local_buffer[i] != NULL);
gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
}
}

void fini_cuda_support()
void fini_cuda_support(const std::vector<int> &local_gpus)
{
for (int i = 0; i < nb_local_buffer; i++) {
gpuErrchk( cudaSetDevice(0) );
// printf("fini cuda support nb_gpus %d\n", local_gpus.size());
for (size_t i = 0; i < local_buffer.size(); i++) {
gpuErrchk( cudaSetDevice(local_gpus[i]) );
gpuErrchk( cudaFree(local_buffer[i]) );
local_buffer[i] = NULL;
gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
}
local_buffer.clear();
}
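// With this change the core library no longer calls init_cuda_support()/
// fini_cuda_support() itself (the calls were dropped from App::App() and
// App::report_timing() in core.cc); each runtime driver, such as the Realm
// implementation this PR targets, is expected to invoke them with its own
// list of process-local GPU ordinals.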

void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
{
// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
assert(scratch_bytes <= local_buffer_size);
assert(kernel.gpu_id == 0);


if (kernel.memcpy_required == 1) {
// printf("enable memcpy in\n");
gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
}
if (kernel.cuda_unroll == 4) {
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
} else if (kernel.cuda_unroll == 8) {
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
} else if (kernel.cuda_unroll == 16) {
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
} else {
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
}
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
if (kernel.memcpy_required == 1) {
// printf("enable memcpy out\n");
gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
}
}

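// Variant for caller-managed device memory: data is staged into device_ptr,
// and all work is issued on the default stream (0) rather than a stream from
// cuda_stream_array -- presumably acceptable if each GPU is driven by a
// single runtime processor/thread at a time.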
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes) {
printf("CUDA COMPUTE KERNEL scratch_ptr %p, size %lld, nb_blocks %d, threads_per_block %d, device_ptr %p, size %lld\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block, device_ptr, device_bytes);
assert(scratch_bytes <= device_bytes);

if (kernel.memcpy_required == 1) {
// printf("enable memcpy in\n");
gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaMemcpyAsync(device_ptr, scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, 0) );
gpuErrchk( cudaStreamSynchronize(0) );
}
if (kernel.cuda_unroll == 4) {
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
} else if (kernel.cuda_unroll == 8) {
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
} else if (kernel.cuda_unroll == 16) {
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
} else {
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
}
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaStreamSynchronize(0) );
if (kernel.memcpy_required == 1) {
// printf("enable memcpy out\n");
gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaMemcpyAsync(scratch_ptr, device_ptr, scratch_bytes, cudaMemcpyDeviceToHost, 0) );
gpuErrchk( cudaStreamSynchronize(0) );
}
}
