diff --git a/core/core.cc b/core/core.cc index 2b6c2f0c..61feb5f1 100644 --- a/core/core.cc +++ b/core/core.cc @@ -80,11 +80,42 @@ void Kernel::execute(long graph_index, long timestep, long point, assert(timestep >= 0 && point >= 0); execute_kernel_imbalance(*this, graph_index, timestep, point); break; + default: + assert(false && "unimplemented kernel type"); + }; +} + +void Kernel::execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, int gpu_id) const +{ + switch(type) { + case KernelType::EMPTY: + execute_kernel_empty(*this); + break; #ifdef ENABLE_CUDA case KernelType::CUDA_COMPUTE_BOUND: assert(scratch_ptr != NULL); assert(scratch_bytes > 0); - execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes); + execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, gpu_id); + break; +#endif + default: + assert(false && "unimplemented kernel type"); + }; +} + +void Kernel::execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes) const +{ + switch(type) { + case KernelType::EMPTY: + execute_kernel_empty(*this); + break; +#ifdef ENABLE_CUDA + case KernelType::CUDA_COMPUTE_BOUND: + assert(scratch_ptr != NULL); + assert(scratch_bytes > 0); + execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, device_ptr, device_bytes); break; #endif default: @@ -633,6 +664,156 @@ void TaskGraph::execute_point(long timestep, long point, k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes); } +void TaskGraph::execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + int gpu_id) const +{ +#ifdef DEBUG_CORE + // Validate graph_index + assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8); + has_executed_graph |= 1 << graph_index; +#endif + + // Validate timestep and point + assert(0 <= timestep && timestep < timesteps); + + long offset = offset_at_timestep(timestep); + long width = width_at_timestep(timestep); + assert(offset <= point && point < offset+width); + + long last_offset = offset_at_timestep(timestep-1); + long last_width = width_at_timestep(timestep-1); + + // Validate input + { + size_t idx = 0; + long dset = dependence_set_at_timestep(timestep); + size_t max_deps = num_dependencies(dset, point); + std::pair *deps = reinterpret_cast *>(alloca(sizeof(std::pair) * max_deps)); + size_t num_deps = dependencies(dset, point, deps); + for (size_t span = 0; span < num_deps; span++) { + for (long dep = deps[span].first; dep <= deps[span].second; dep++) { + if (last_offset <= dep && dep < last_offset + last_width) { + assert(idx < n_inputs); + + assert(input_bytes[idx] == output_bytes_per_task); + assert(input_bytes[idx] >= sizeof(std::pair)); + + const std::pair *input = reinterpret_cast *>(input_ptr[idx]); + for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair); ++i) { + assert(input[i].first == timestep - 1); + assert(input[i].second == dep); + } + idx++; + } + } + } + // FIXME (Elliott): Legion is currently passing in uninitialized + // memory for dependencies outside of the last offset/width. 
+ // assert(idx == n_inputs); + } + + // Validate output + assert(output_bytes == output_bytes_per_task); + assert(output_bytes >= sizeof(std::pair)); + + // Generate output + std::pair *output = reinterpret_cast *>(output_ptr); + for (size_t i = 0; i < output_bytes/sizeof(std::pair); ++i) { + output[i].first = timestep; + output[i].second = point; + } + + // Validate scratch + assert(scratch_bytes == scratch_bytes_per_task); + if (scratch_bytes > 0) { + uint64_t *scratch = reinterpret_cast(scratch_ptr); + assert(*scratch == MAGIC_VALUE); + } + + // Execute kernel + Kernel k(kernel); + k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, gpu_id); +} + +void TaskGraph::execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + char *device_ptr, size_t device_bytes) const +{ +#ifdef DEBUG_CORE + // Validate graph_index + assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8); + has_executed_graph |= 1 << graph_index; +#endif + + // Validate timestep and point + assert(0 <= timestep && timestep < timesteps); + + long offset = offset_at_timestep(timestep); + long width = width_at_timestep(timestep); + assert(offset <= point && point < offset+width); + + long last_offset = offset_at_timestep(timestep-1); + long last_width = width_at_timestep(timestep-1); + + // Validate input + { + size_t idx = 0; + long dset = dependence_set_at_timestep(timestep); + size_t max_deps = num_dependencies(dset, point); + std::pair *deps = reinterpret_cast *>(alloca(sizeof(std::pair) * max_deps)); + size_t num_deps = dependencies(dset, point, deps); + for (size_t span = 0; span < num_deps; span++) { + for (long dep = deps[span].first; dep <= deps[span].second; dep++) { + if (last_offset <= dep && dep < last_offset + last_width) { + assert(idx < n_inputs); + + assert(input_bytes[idx] == output_bytes_per_task); + assert(input_bytes[idx] >= sizeof(std::pair)); + + const std::pair *input = reinterpret_cast *>(input_ptr[idx]); + for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair); ++i) { + assert(input[i].first == timestep - 1); + assert(input[i].second == dep); + } + idx++; + } + } + } + // FIXME (Elliott): Legion is currently passing in uninitialized + // memory for dependencies outside of the last offset/width. 
+ // assert(idx == n_inputs); + } + + // Validate output + assert(output_bytes == output_bytes_per_task); + assert(output_bytes >= sizeof(std::pair)); + + // Generate output + std::pair *output = reinterpret_cast *>(output_ptr); + for (size_t i = 0; i < output_bytes/sizeof(std::pair); ++i) { + output[i].first = timestep; + output[i].second = point; + } + + // Validate scratch + assert(scratch_bytes == scratch_bytes_per_task); + if (scratch_bytes > 0) { + uint64_t *scratch = reinterpret_cast(scratch_ptr); + assert(*scratch == MAGIC_VALUE); + } + + // Execute kernel + Kernel k(kernel); + k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, device_ptr, device_bytes); +} + void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes) { assert(scratch_bytes % sizeof(uint64_t) == 0); @@ -654,7 +835,7 @@ static TaskGraph default_graph(long graph_index) graph.period = -1; graph.fraction_connected = 0.25; #ifdef ENABLE_CUDA - graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 0, 1}; + graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 1}; #else graph.kernel = {KernelType::EMPTY, 0, 16, 0.0}; #endif @@ -962,9 +1143,6 @@ App::App(int argc, char **argv) check(); -#ifdef ENABLE_CUDA - init_cuda_support(graphs); -#endif } void App::check() const @@ -1253,8 +1431,5 @@ void App::report_timing(double elapsed_seconds) const #ifdef DEBUG_CORE printf("Task Graph Execution Mask %llx\n", has_executed_graph.load()); #endif - -#ifdef ENABLE_CUDA - fini_cuda_support(); -#endif + } diff --git a/core/core.h b/core/core.h index 5b8da472..85866851 100644 --- a/core/core.h +++ b/core/core.h @@ -34,6 +34,10 @@ struct Kernel : public kernel_t { private: void execute(long graph_index, long timestep, long point, char *scratch_ptr, size_t scratch_bytes) const; + void execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, int gpu_id) const; + void execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, char* device_ptr, size_t device_bytes) const; friend struct TaskGraph; }; @@ -69,6 +73,18 @@ struct TaskGraph : public task_graph_t { const char **input_ptr, const size_t *input_bytes, size_t n_inputs, char *scratch_ptr, size_t scratch_bytes) const; + void execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + int gpu_id) const; + void execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + char *device_ptr, size_t device_bytes) const; static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes); }; diff --git a/core/core_c.h b/core/core_c.h index 5f3847c4..7a2bac5c 100644 --- a/core/core_c.h +++ b/core/core_c.h @@ -63,7 +63,6 @@ typedef struct kernel_t { int nb_blocks; int threads_per_block; int memcpy_required; - int gpu_id; int cuda_unroll; #endif } kernel_t; diff --git a/core/cuda_kernel.cu b/core/cuda_kernel.cu index 1c3a375d..714cc856 100644 --- a/core/cuda_kernel.cu +++ b/core/cuda_kernel.cu @@ -3,7 +3,6 @@ #include "cuda_kernel.h" std::vector local_buffer; -int nb_local_buffer = 0; size_t local_buffer_size; cudaStream_t cuda_stream_array[8]; @@ -27,61 +26,88 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A __global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, 
                                                             double *A);
 __global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);
 
-void init_cuda_support(const std::vector<TaskGraph> &graphs)
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
 {
-  int nb_gpus = 1;
+  int nb_gpus = local_gpus.size();
 
-  nb_local_buffer = nb_gpus;
-  local_buffer.reserve(nb_local_buffer);
+  local_buffer.resize(nb_gpus);  // resize (not reserve): fini_cuda_support() iterates over local_buffer.size()
 
   int nb_blocks = graphs[0].kernel.nb_blocks;
   int threads_per_block = graphs[0].kernel.threads_per_block;
   int cuda_unroll = graphs[0].kernel.cuda_unroll;
-  printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
+  // printf("init cuda support nb_gpus %d, nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_gpus, nb_blocks, threads_per_block, cuda_unroll);
 
   local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
 
   for (int i = 0; i < nb_gpus; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
     gpuErrchk( cudaMalloc((void**)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
     assert(local_buffer[i] != NULL);
     gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
   }
 }
 
-void fini_cuda_support()
+void fini_cuda_support(const std::vector<int> &local_gpus)
 {
-  for (int i = 0; i < nb_local_buffer; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+  // printf("fini cuda support nb_gpus %zu\n", local_gpus.size());
+  for (size_t i = 0; i < local_buffer.size(); i++) {
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
     gpuErrchk( cudaFree(local_buffer[i]) );
     local_buffer[i] = NULL;
     gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
   }
+  local_buffer.clear();
 }
 
-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
 {
-// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
+  // printf("CUDA COMPUTE KERNEL buffer %p, size %zu, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
   assert(scratch_bytes <= local_buffer_size);
-  assert(kernel.gpu_id == 0);
-
+
+  if (kernel.memcpy_required == 1) {
+    // printf("enable memcpy in\n");
+    gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
+  }
+  if (kernel.cuda_unroll == 4) {
+    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  } else if (kernel.cuda_unroll == 8) {
+    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  } else if (kernel.cuda_unroll == 16) {
+    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  } else {
+    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  }
+  gpuErrchk( cudaPeekAtLastError() );
+  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
+  if (kernel.memcpy_required == 1) {
+    // printf("enable memcpy out\n");
+    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
+  }
+}
+
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes)
+{
+  // printf("CUDA COMPUTE KERNEL scratch_ptr %p, size %zu, nb_blocks %d, threads_per_block %d, device_ptr %p, size %zu\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block, device_ptr, device_bytes);
+  assert(scratch_bytes <= device_bytes);
+
   if (kernel.memcpy_required == 1) {
     // printf("enable memcpy in\n");
-    gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(device_ptr, scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, 0) );
+    gpuErrchk( cudaStreamSynchronize(0) );
   }
   if (kernel.cuda_unroll == 4) {
-    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   } else if (kernel.cuda_unroll == 8) {
-    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   } else if (kernel.cuda_unroll == 16) {
-    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   } else {
-    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   }
   gpuErrchk( cudaPeekAtLastError() );
-  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+  gpuErrchk( cudaStreamSynchronize(0) );
   if (kernel.memcpy_required == 1) {
     // printf("enable memcpy out\n");
-    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(scratch_ptr, device_ptr, scratch_bytes, cudaMemcpyDeviceToHost, 0) );
+    gpuErrchk( cudaStreamSynchronize(0) );
   }
 }
diff --git a/core/cuda_kernel.h b/core/cuda_kernel.h
index 4a6ba2a8..caea05a6 100644
--- a/core/cuda_kernel.h
+++ b/core/cuda_kernel.h
@@ -11,10 +11,12 @@
 extern std::vector<char*> local_buffer;
 extern size_t local_buffer_size;
 
-void init_cuda_support(const std::vector<TaskGraph> &graphs);
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus);
 
-void fini_cuda_support();
+void fini_cuda_support(const std::vector<int> &local_gpus);
 
-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes);
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id);
+
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes);
 #endif
\ No newline at end of file
diff --git a/mpi_gpu/nonblock_gpu.cc b/mpi_gpu/nonblock_gpu.cc
index 2e45e6d3..7f62c7cb 100644
--- a/mpi_gpu/nonblock_gpu.cc
+++ b/mpi_gpu/nonblock_gpu.cc
@@ -22,7 +22,9 @@
 
 #include "core.h"
 #include "mpi.h"
+#include "cuda_kernel.h"
 
+// Only supports one GPU per MPI process
 int main(int argc, char *argv[])
 {
   MPI_Init(&argc, &argv);
@@ -31,6 +33,14 @@ int main(int argc, char *argv[])
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
   App app(argc, argv);
+
+  int
local_rank, local_size; + MPI_Comm MPI_COMM_LOCAL; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_LOCAL); + MPI_Comm_rank(MPI_COMM_LOCAL, &local_rank); + std::vector local_gpus(1, local_rank); + init_cuda_support(app.graphs, local_gpus); + if (rank == 0) app.display(); double elapsed_time = 0.0; @@ -177,7 +187,7 @@ int main(int argc, char *argv[]) graph.execute_point(timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), point_input_bytes.data(), point_n_inputs, - scratch_ptr, scratch_bytes); + scratch_ptr, scratch_bytes, 0); } } } @@ -195,5 +205,7 @@ int main(int argc, char *argv[]) app.report_timing(elapsed_time); } + fini_cuda_support(local_gpus); + MPI_Finalize(); } diff --git a/realm_gpu/Makefile b/realm_gpu/Makefile new file mode 100644 index 00000000..3070d44d --- /dev/null +++ b/realm_gpu/Makefile @@ -0,0 +1,58 @@ +# Copyright 2020 Stanford University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +ifndef LG_RT_DIR +$(error LG_RT_DIR variable is not defined, aborting build) +endif + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 0 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= task_bench +# List all the application source files here +GEN_SRC ?= main.cc # .cc files +GEN_GPU_SRC ?= # .cu files + +# You can modify these variables, some will be appended to by the runtime makefile +INC_FLAGS ?= +CC_FLAGS ?= +NVCC_FLAGS ?= +GASNET_FLAGS ?= +LD_FLAGS ?= + +CC_FLAGS += -std=c++11 -I../core +LD_FLAGS += -L../core -lcore_s + +include ../core/make_blas.mk +include ./make_cuda_realm.mk +########################################################################### +# +# Don't change anything below here +# +########################################################################### + +include $(LG_RT_DIR)/runtime.mk + +# Make absolutely sure application code is compiled with -O3 (in non-debug mode) +ifneq ($(strip $(DEBUG)),1) +$(GEN_OBJS) : CC_FLAGS += -O3 +endif diff --git a/realm_gpu/main.cc b/realm_gpu/main.cc new file mode 100644 index 00000000..eb4ca31d --- /dev/null +++ b/realm_gpu/main.cc @@ -0,0 +1,1171 @@ +/* Copyright 2020 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "main.h" + +#include +#include +#include + +#include + +#include "realm.h" + +#include "core.h" + +#include +#include +#include + +using namespace Realm; +using namespace Realm::Serialization; + +TYPE_IS_SERIALIZABLE(TaskGraph); + +// Hack: Track argc/argv in globals to avoid needing to serialize arguments to top_level task. +static int global_argc = 0; +static char **global_argv = NULL; + +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE, + SHARD_TASK, + LEAF_TASK, +}; + +enum { + REDOP_MIN = 11, + REDOP_MAX = 12, +}; + +enum { + FID_FIRST = 100, +}; + +#define DECLARE_REDUCTION(CLASS, T, U, APPLY_OP, FOLD_OP, ID) \ + class CLASS { \ + public: \ + typedef T LHS, RHS; \ + template \ + static void apply(LHS &lhs, RHS rhs); \ + template \ + static void fold(RHS &rhs1, RHS rhs2); \ + static const T identity; \ + }; \ + \ + const T CLASS::identity = ID; \ + \ + template <> \ + void CLASS::apply(LHS & lhs, RHS rhs) \ + { \ + lhs = APPLY_OP(lhs, rhs); \ + } \ + \ + template <> \ + void CLASS::apply(LHS & lhs, RHS rhs) \ + { \ + volatile U *target = (U *)&(lhs); \ + union { \ + U as_U; \ + T as_T; \ + } oldval, newval; \ + do { \ + oldval.as_U = *target; \ + newval.as_T = APPLY_OP(oldval.as_T, rhs); \ + } while (!__sync_bool_compare_and_swap(target, oldval.as_U, newval.as_U)); \ + } \ + \ + template <> \ + void CLASS::fold(RHS & rhs1, RHS rhs2) \ + { \ + rhs1 = FOLD_OP(rhs1, rhs2); \ + } \ + \ + template <> \ + void CLASS::fold(RHS & rhs1, RHS rhs2) \ + { \ + volatile U *target = (U *)&rhs1; \ + union { \ + U as_U; \ + T as_T; \ + } oldval, newval; \ + do { \ + oldval.as_U = *target; \ + newval.as_T = FOLD_OP(oldval.as_T, rhs2); \ + } while (!__sync_bool_compare_and_swap(target, oldval.as_U, newval.as_U)); \ + } + +DECLARE_REDUCTION(RedopMin, unsigned long long, unsigned long long, + std::min, std::min, std::numeric_limits::max()) +DECLARE_REDUCTION(RedopMax, unsigned long long, unsigned long long, + std::max, std::max, std::numeric_limits::min()) + +#undef DECLARE_REDUCTION + +Event copy(RegionInstance src_inst, RegionInstance dst_inst, FieldID fid, + size_t value_size, Event wait_for) +{ + CopySrcDstField src_field; + src_field.inst = src_inst; + src_field.field_id = fid; + src_field.size = value_size; + + std::vector src_fields; + src_fields.push_back(src_field); + + CopySrcDstField dst_field; + dst_field.inst = dst_inst; + dst_field.field_id = fid; + dst_field.size = value_size; + + std::vector dst_fields; + dst_fields.push_back(dst_field); + + return dst_inst.get_indexspace<1, coord_t>().copy( + src_fields, dst_fields, ProfilingRequestSet(), wait_for); +} + +Event fill(RegionInstance dst_inst, FieldID fid, + const void *value, size_t value_size, + Event wait_for) +{ + CopySrcDstField dst_field; + dst_field.inst = dst_inst; + dst_field.field_id = fid; + dst_field.size = value_size; + + std::vector dst_fields; + dst_fields.push_back(dst_field); + return dst_inst.get_indexspace<1, coord_t>().fill( + dst_fields, ProfilingRequestSet(), value, value_size, wait_for); +} + +char *get_base(RegionInstance inst, FieldID fid) +{ + AffineAccessor acc = + AffineAccessor(inst, fid); + return reinterpret_cast( + acc.ptr(inst.get_indexspace<1, coord_t>().bounds.lo)); +} + +void leaf_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) +{ + LeafArgs a; + std::vector input_ptr; + std::vector input_bytes; + { + FixedBufferDeserializer 
ser(args, arglen); + ser >> a; + ser >> input_ptr; + ser >> input_bytes; + assert(ser.bytes_left() == 0); + } + + // std::cout << "leaf_task: " << " point " << a.point << " timestep " << a.timestep << std::endl; + a.graph.execute_point(a.timestep, a.point, + a.output_ptr, a.output_bytes, + (const char **)input_ptr.data(), (size_t *)input_bytes.data(), a.n_inputs, + a.scratch_ptr, a.scratch_bytes, a.device_ptr, a.device_bytes); +} + +void shard_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) +{ + ShardArgs a; + std::vector graphs; + std::vector > task_results; + std::vector > task_inputs; + std::vector > raw_exchange; + std::vector > war_exchange; + { + FixedBufferDeserializer ser(args, arglen); + ser >> a; + ser >> graphs; + ser >> task_results; + ser >> task_inputs; + ser >> raw_exchange; + ser >> war_exchange; + + assert(ser.bytes_left() == 0); + } + + long proc_index = a.proc_index; + long num_procs = a.num_procs; + long num_fields = a.num_fields; + long force_copies = a.force_copies; + Memory gpumem = a.gpumem; + Memory zcopymem = a.zcopymem; + Barrier sync = a.sync; + Barrier first_start = a.first_start; + Barrier last_start = a.last_start; + Barrier first_stop = a.first_stop; + Barrier last_stop = a.last_stop; + + RegionInstance device_inst; + size_t device_bytes; + char *device_ptr; + { + std::map field_sizes; + field_sizes[FID_FIRST] = sizeof(char); + + int nb_blocks = graphs[0].kernel.nb_blocks; + int threads_per_block = graphs[0].kernel.threads_per_block; + int cuda_unroll = graphs[0].kernel.cuda_unroll; + device_bytes = nb_blocks * threads_per_block * sizeof(double); + Rect1 bounds(Point1(0), Point1(device_bytes * cuda_unroll)); + RegionInstance::create_instance(device_inst, a.gpumem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()).wait(); + device_ptr = get_base(device_inst, FID_FIRST); + } + // Figure out who we're going to be communicating with. 
+ + // graph -> point -> [remote point] + std::vector > > raw_exchange_points(graphs.size()); + std::vector > > war_exchange_points(graphs.size()); + + // graph -> [remote point] + std::vector > raw_all_points(graphs.size()); + std::vector > war_all_points(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_exchange_points.at(graph_index).resize(last_point - first_point + 1); + war_exchange_points.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long max_dset = graph.max_dependence_sets(); + for (long dset = 0; dset < max_dset; ++dset) { + for (auto interval : graph.dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + raw_exchange_points.at(graph_index).at(point - first_point).insert(dep); + raw_all_points.at(graph_index).insert(dep); + } + } + for (auto interval : graph.reverse_dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + war_exchange_points.at(graph_index).at(point - first_point).insert(dep); + war_all_points.at(graph_index).insert(dep); + } + } + } + } + } + + // graph -> point -> dset -> [remote point] + std::vector > > > raw_points_not_in_dset(graphs.size()); + std::vector > > > war_points_not_in_dset(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_points_not_in_dset.at(graph_index).resize(last_point - first_point + 1); + war_points_not_in_dset.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long max_dset = graph.max_dependence_sets(); + + raw_points_not_in_dset.at(graph_index).at(point - first_point).resize(max_dset); + war_points_not_in_dset.at(graph_index).at(point - first_point).resize(max_dset); + + for (long dset = 0; dset < max_dset; ++dset) { + std::set raw_points = raw_exchange_points.at(graph_index).at(point - first_point); + std::set war_points = war_exchange_points.at(graph_index).at(point - first_point); + + for (auto interval : graph.dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + raw_points.erase(dep); + } + } + for (auto interval : graph.reverse_dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + war_points.erase(dep); + } + } + + raw_points_not_in_dset.at(graph_index).at(point - first_point).at(dset).assign(raw_points.begin(), raw_points.end()); + war_points_not_in_dset.at(graph_index).at(point - first_point).at(dset).assign(war_points.begin(), war_points.end()); + } + } + } + + // Create local input regions. 
+ + // graph -> point -> max deps + std::vector > max_deps(graphs.size()); + long all_max_deps = 0; + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + max_deps.at(graph_index).resize(graph.max_width, 0); + + for (long point = 0; point < graph.max_width; ++point) { + long max_dset = graph.max_dependence_sets(); + for (long dset = 0; dset < max_dset; ++dset) { + long deps = 0; + for (auto interval : graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; + } + max_deps.at(graph_index).at(point) = std::max(max_deps.at(graph_index).at(point), deps); + all_max_deps = std::max(all_max_deps, deps); + } + } + } + + // graph -> point -> dep -> instance + std::vector > > inputs(graphs.size()); + // graph -> point -> dep -> field -> base pointer + std::vector > > > input_base(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + Rect1 bounds(Point1(0), Point1(graph.output_bytes_per_task - 1)); + + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(char); + } + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + inputs.at(graph_index).resize(last_point - first_point + 1); + input_base.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long deps = max_deps.at(graph_index).at(point); + + inputs.at(graph_index).at(point - first_point).resize(deps); + input_base.at(graph_index).at(point - first_point).resize(deps); + + AffineAccessor task_input = + AffineAccessor(task_inputs.at(graph_index).at(point), FID_FIRST); + + for (long dep = 0; dep < deps; ++dep) { + RegionInstance &inst = inputs.at(graph_index).at(point - first_point).at(dep); + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + + input_base.at(graph_index).at(point - first_point).at(dep).resize(num_fields); + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + input_base.at(graph_index).at(point - first_point).at(dep).at(fid - FID_FIRST) = + get_base(inst, fid); + } + + task_input[dep] = inst; + } + for (long dep = deps; dep < all_max_deps; ++dep) { + task_input[dep] = RegionInstance::NO_INST; + } + } + } + + // Create barriers. 
+ + // graph -> point -> field -> remote point -> barrier + std::vector > > > raw_in(graphs.size()); + std::vector > > > war_in(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_in.at(graph_index).resize(last_point - first_point + 1); + war_in.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + auto &raw_points = raw_exchange_points.at(graph_index).at(point - first_point); + auto &war_points = war_exchange_points.at(graph_index).at(point - first_point); + + raw_in.at(graph_index).at(point - first_point).resize(num_fields); + war_in.at(graph_index).at(point - first_point).resize(num_fields); + + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + const Barrier no_barrier = Barrier::NO_BARRIER; + fill(raw_exchange.at(graph_index).at(point), fid, + &no_barrier, sizeof(no_barrier), Event::NO_EVENT) + .wait(); + fill(war_exchange.at(graph_index).at(point), fid, + &no_barrier, sizeof(no_barrier), Event::NO_EVENT) + .wait(); + + AffineAccessor raw = + AffineAccessor(raw_exchange.at(graph_index).at(point), fid); + AffineAccessor war = + AffineAccessor(war_exchange.at(graph_index).at(point), fid); + + for (auto dep : raw_points) { + Barrier bar = Barrier::create_barrier(1); + raw_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + raw[dep] = bar; + } + for (auto dep : war_points) { + Barrier bar = Barrier::create_barrier(1); + war_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + war[dep] = bar; + } + } + } + } + + // Perform input region/barrier exchange. 
+ sync.arrive(1); + sync.wait(); + sync = sync.advance_barrier(); + + // graph -> remote point -> instance + std::vector > raw_local_out(graphs.size()); + std::vector > war_local_out(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + Rect1 bounds(Point1(0), Point1(graph.max_width - 1)); + + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(Barrier); + } + + for (auto &dep : war_all_points.at(graph_index)) { + RegionInstance inst; + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + raw_local_out.at(graph_index)[dep] = inst; + } + + for (auto &dep : raw_all_points.at(graph_index)) { + RegionInstance inst; + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + war_local_out.at(graph_index)[dep] = inst; + } + } + + std::vector events; + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + for (auto &dep : war_all_points.at(graph_index)) { + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + events.push_back( + copy(raw_exchange.at(graph_index).at(dep), + raw_local_out.at(graph_index).at(dep), + fid, sizeof(Barrier), Event::NO_EVENT)); + } + } + + for (auto &dep : raw_all_points.at(graph_index)) { + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + events.push_back( + copy(war_exchange.at(graph_index).at(dep), + war_local_out.at(graph_index).at(dep), + fid, sizeof(Barrier), Event::NO_EVENT)); + } + } + } + Event::merge_events(events).wait(); + events.clear(); + + // graph -> point -> field -> remote point -> barrier + std::vector > > > raw_out(graphs.size()); + std::vector > > > war_out(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_out.at(graph_index).resize(last_point - first_point + 1); + war_out.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + auto &raw_points = raw_exchange_points.at(graph_index).at(point - first_point); + auto &war_points = war_exchange_points.at(graph_index).at(point - first_point); + + raw_out.at(graph_index).at(point - first_point).resize(num_fields); + war_out.at(graph_index).at(point - first_point).resize(num_fields); + + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + for (auto dep : war_points) { + AffineAccessor raw = + AffineAccessor(raw_local_out.at(graph_index).at(dep), fid); + Barrier bar = raw[point]; + assert(bar != Barrier::NO_BARRIER); + raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + } + for (auto dep : raw_points) { + AffineAccessor war = + AffineAccessor(war_local_out.at(graph_index).at(dep), fid); + Barrier bar = war[point]; + assert(bar != Barrier::NO_BARRIER); + war_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + } + } + } + } + + // graph -> remote point -> instance + std::vector > remote_task_inputs(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + Rect1 bounds(Point1(0), Point1(all_max_deps - 1)); + + std::map field_sizes; + field_sizes[FID_FIRST] = sizeof(RegionInstance); + + for 
(auto &dep : war_all_points.at(graph_index)) { + RegionInstance inst; + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + remote_task_inputs.at(graph_index)[dep] = inst; + } + } + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + for (auto &dep : war_all_points.at(graph_index)) { + events.push_back( + copy(task_inputs.at(graph_index).at(dep), + remote_task_inputs.at(graph_index).at(dep), + FID_FIRST, sizeof(RegionInstance), Event::NO_EVENT)); + } + } + Event::merge_events(events).wait(); + events.clear(); + + // graph -> point -> remote point -> dep -> instance + std::vector > > > remote_inputs(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + remote_inputs.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + auto &war_points = war_exchange_points.at(graph_index).at(point - first_point); + + for (auto dep : war_points) { + remote_inputs.at(graph_index).at(point - first_point)[dep].resize(all_max_deps); + + AffineAccessor remote_task_input = + AffineAccessor(remote_task_inputs.at(graph_index).at(dep), FID_FIRST); + + for (long remote_dep = 0; remote_dep < all_max_deps; ++remote_dep) { + RegionInstance inst = remote_task_input[remote_dep]; + remote_inputs.at(graph_index).at(point - first_point)[dep].at(remote_dep) = inst; + } + } + } + } + + // graph -> point -> dset -> remote point -> slot + std::vector > > > remote_input_slot(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + remote_input_slot.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long max_dset = graph.max_dependence_sets(); + + remote_input_slot.at(graph_index).at(point - first_point).resize(max_dset); + + for (long dset = 0; dset < max_dset; ++dset) { + for (auto interval : graph.reverse_dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + long slot = 0; + bool found = false; + for (auto interval : graph.dependencies(dset, dep)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + if (point == dep) { + found = true; + break; + } + slot++; + } + if (found) break; + } + assert(found); + remote_input_slot.at(graph_index).at(point - first_point).at(dset)[dep] = slot; + } + } + } + } + } + + // graph -> point -> field -> base pointer + std::vector > > result_base(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + result_base.at(graph_index).resize(graph.max_width); + + for (long point = 0; point < graph.max_width; ++point) { + + result_base.at(graph_index).at(point).resize(num_fields, NULL); + + auto &inst = task_results.at(graph_index).at(point); + if (inst.get_location() == gpumem || inst.get_location() == zcopymem) { + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + result_base.at(graph_index).at(point).at(fid - FID_FIRST) = + get_base(inst, fid); + } + } + } + } + + std::vector 
input_ptr(all_max_deps, 0); + std::vector input_bytes(all_max_deps, 0); + + size_t max_scratch_bytes = 0; + for (auto graph : graphs) { + max_scratch_bytes = std::max(max_scratch_bytes, graph.scratch_bytes_per_task); + } + + // It's ok to use only a single scratch buffer because the tasks + // will be effectively serialized on this processor. + char *scratch_ptr = NULL; + cudaHostAlloc((void**)&(scratch_ptr), max_scratch_bytes, 0); + assert(scratch_ptr); + TaskGraph::prepare_scratch(scratch_ptr, max_scratch_bytes); + + // Statically allocate buffer to use for task input + size_t leaf_bufsize = 0; + { + LeafArgs leaf_args; + ByteCountSerializer ser; + ser << leaf_args; + ser << input_ptr; + ser << input_bytes; + leaf_bufsize = ser.bytes_used(); + } + void *leaf_buffer = malloc(leaf_bufsize); + assert(leaf_buffer); + + // Sync again to avoid staggered start + sync.arrive(1); + sync.wait(); + sync = sync.advance_barrier(); + + // Main loop + unsigned long long start_time = 0, stop_time = 0; + std::vector preconditions; + std::vector > > copy_postconditions; + for (long rep = 0; rep < 1; ++rep) { + start_time = Clock::current_time_in_nanoseconds(); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + std::fill(input_bytes.begin(), input_bytes.end(), graph.output_bytes_per_task); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + // Don't leak copy postconditions between graphs. + copy_postconditions.resize(last_point - first_point + 1); + for (auto &point_postconditions : copy_postconditions) { + point_postconditions.resize(num_fields); + for (auto &postconditions : point_postconditions) { + postconditions.clear(); + } + } + + for (long timestep = 0; timestep < graph.timesteps; ++timestep) { + long dset = graph.dependence_set_at_timestep(timestep); + long next_dset = graph.dependence_set_at_timestep(timestep + 1); + long last_field_dset = graph.dependence_set_at_timestep(std::max(timestep - num_fields + 1, 0L)); + + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long last_field_offset = graph.offset_at_timestep(timestep - num_fields + 1); + long last_field_width = graph.width_at_timestep(timestep - num_fields + 1); + + long next_offset = timestep + 1 < graph.timesteps ? graph.offset_at_timestep(timestep+1) : 0; + long next_width = timestep + 1 < graph.timesteps ? 
graph.width_at_timestep(timestep+1) : 0; + + long fid = FID_FIRST + timestep % num_fields; + long last_fid = FID_FIRST + (timestep + num_fields - 1) % num_fields; + + for (long point = first_point; point <= last_point; ++point) { + // Gather inputs + long n_inputs = 0, slot = 0; + preconditions.clear(); + preconditions.insert(preconditions.begin(), + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).begin(), + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).end()); + const auto &dset_deps = graph.dependencies(dset, point); + for (auto interval : dset_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + Barrier &ready = raw_in.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST).at(dep); + preconditions.push_back(ready.get_previous_phase()); + + if (dep >= last_offset && dep < last_offset + last_width) { + char *data = result_base.at(graph_index).at(dep).at(last_fid - FID_FIRST); + if (point >= offset && point < offset + width) { + if (data && !force_copies) { + // Data available locally + } else { + // Data is remote + data = input_base.at(graph_index).at(point - first_point).at(slot).at(last_fid - FID_FIRST); + } + } + input_ptr.at(n_inputs) = reinterpret_cast(data); + n_inputs++; + } + slot++; + } + } + + // Dependencies can occur in one of two ways: + // 1. The dependent task is local, so copy is not necessary. + // (In this case the dependency catches on the task itself.) + // 2. The dependent task is non-local, so copy is necessary. + // (In this case the dependency catches on the copy.) + + // WAR dependencies (part 1) + for (auto interval : graph.reverse_dependencies(last_field_dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + if (dep >= last_field_offset && dep < last_field_offset + last_field_width) { + // Only copy when the dependent task doesn't live in the same address space. 
+ if (!force_copies && result_base.at(graph_index).at(dep).at(last_fid - FID_FIRST)) { + Barrier &ready = war_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep); + preconditions.push_back(ready.get_previous_phase()); + } + } + } + } + + // WAR dependencies (part 2) + const auto &next_dset_rev_deps = graph.reverse_dependencies(next_dset, point); + for (auto interval : next_dset_rev_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + if (force_copies || !result_base.at(graph_index).at(dep).at(last_fid - FID_FIRST)) { + Barrier &ready = war_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep); + preconditions.push_back(ready.get_previous_phase()); + } + } + } + + // Launch task + Event task_postcondition = Event::NO_EVENT; + if (point >= offset && point < offset + width) { + LeafArgs leaf_args; + leaf_args.point = point; + leaf_args.timestep = timestep; + leaf_args.graph = graph; + leaf_args.output_ptr = result_base.at(graph_index).at(point).at(fid - FID_FIRST); + leaf_args.output_bytes = graph.output_bytes_per_task; + leaf_args.scratch_ptr = scratch_ptr; + leaf_args.scratch_bytes = graph.scratch_bytes_per_task; + leaf_args.n_inputs = n_inputs; + leaf_args.device_ptr = device_ptr; + leaf_args.device_bytes = device_bytes; + + FixedBufferSerializer ser(leaf_buffer, leaf_bufsize); + ser << leaf_args; + ser << input_ptr; + ser << input_bytes; + assert(ser.bytes_left() == 0); + + task_postcondition = + p.spawn(LEAF_TASK, leaf_buffer, leaf_bufsize, + Event::merge_events(preconditions)); + + // FIXME: Figure out which tasks we actually need to wait on + events.push_back(task_postcondition); + } + + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).clear(); + + // RAW dependencies + for (auto interval : next_dset_rev_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + Barrier &complete = raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep); + Event postcondition = task_postcondition; + if (dep >= next_offset && dep < next_offset + next_width) { + // Only copy when the dependent task doesn't live in the same address space. + if (force_copies || !result_base.at(graph_index).at(dep).at(fid - FID_FIRST)) { + long slot = remote_input_slot.at(graph_index).at(point - first_point).at(next_dset).at(dep); + postcondition = copy( + task_results.at(graph_index).at(point), + remote_inputs + .at(graph_index) + .at(point - first_point) + .at(dep) + .at(slot), + fid, graph.output_bytes_per_task, + task_postcondition); + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).push_back(postcondition); + } + } + complete.arrive(1, postcondition); + } + } + // Also need to arrive at any points not included in this + // dset, otherwise we'll deadlock. + for (long dep : war_points_not_in_dset.at(graph_index).at(point - first_point).at(next_dset)) { + raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep).arrive(1); + } + + // WAR dependencies + for (auto interval : dset_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + Barrier &complete = war_out.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST).at(dep); + complete.arrive(1, task_postcondition); + } + } + // Also need to arrive at any points not included in this + // dset, otherwise we'll deadlock. 
+ for (long dep : raw_points_not_in_dset.at(graph_index).at(point - first_point).at(dset)) { + war_out.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST).at(dep).arrive(1, task_postcondition); + } + + for (auto &bar : raw_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + for (auto &bar : raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + for (auto &bar : war_in.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + for (auto &bar : war_out.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + } + } + } + + Event::merge_events(events).wait(); + events.clear(); + + stop_time = Clock::current_time_in_nanoseconds(); + } + + first_start.arrive(1, Event::NO_EVENT, &start_time, sizeof(start_time)); + last_start.arrive(1, Event::NO_EVENT, &start_time, sizeof(start_time)); + first_stop.arrive(1, Event::NO_EVENT, &stop_time, sizeof(stop_time)); + last_stop.arrive(1, Event::NO_EVENT, &stop_time, sizeof(stop_time)); + + free(leaf_buffer); + cudaFreeHost(scratch_ptr); + +} + +void top_level_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) +{ + assert(global_argv != NULL); + App app(global_argc, global_argv); + auto &graphs = app.graphs; + + long num_fields = 5; + bool force_copies = false; + for (int i = 1; i < global_argc; i++) { + if (!strcmp(global_argv[i], "-field")) { + long value = atol(global_argv[++i]); + if (value <= 0) { + fprintf(stderr, "error: Invalid flag \"-field %ld\" must be > 1\n", value); + abort(); + } + num_fields = value; + } + + if (!strcmp(global_argv[i], "-force-copies")) { + force_copies = true; + } + } + + app.display(); + + Machine machine = Machine::get_machine(); + + std::vector procs; + { + Machine::ProcessorQuery query(machine); + query.only_kind(Processor::TOC_PROC); + procs.insert(procs.end(), query.begin(), query.end()); + } + long num_procs = procs.size(); + + std::map proc_zcopymems; + std::map proc_gpumems; + { + std::vector proc_mem_affinities; + machine.get_proc_mem_affinity(proc_mem_affinities); + + for (size_t i = 0; i < proc_mem_affinities.size(); ++i) { + Machine::ProcessorMemoryAffinity &affinity = proc_mem_affinities[i]; + std::cout << "proc: " << i << ", p:" << affinity.p.kind() << ", m:" << affinity.m.kind() << std::endl; + if (affinity.p.kind() == Processor::TOC_PROC) { + if (affinity.m.kind() == Memory::Z_COPY_MEM) { + proc_zcopymems[affinity.p] = affinity.m; + } else if (affinity.m.kind() == Memory::GPU_FB_MEM) + proc_gpumems[affinity.p] = affinity.m; + } + } + } + + long all_max_deps = 0; + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + for (long point = 0; point <= graph.max_width; ++point) { + long max_dset = graph.max_dependence_sets(); + for (long dset = 0; dset < max_dset; ++dset) { + long deps = 0; + for (auto interval : graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; + } + all_max_deps = std::max(all_max_deps, deps); + } + } + } + + std::vector events; + std::vector > task_results(graphs.size()); + { + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(char); + } + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + 
auto graph = graphs.at(graph_index); + + task_results.at(graph_index).resize(graph.max_width); + + Rect1 bounds(Point1(0), Point1(graph.output_bytes_per_task - 1)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + auto memory = proc_zcopymems.at(proc); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + for (long point = first_point; point <= last_point; ++point) { + events.push_back( + RegionInstance::create_instance(task_results.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + } + } + } + } + + std::vector > task_inputs(graphs.size()); + { + std::map field_sizes; + field_sizes[FID_FIRST] = sizeof(RegionInstance); + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + task_inputs.at(graph_index).resize(graph.max_width); + + Rect1 bounds(Point1(0), Point1(all_max_deps - 1)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + auto memory = proc_zcopymems.at(proc); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + for (long point = first_point; point <= last_point; ++point) { + events.push_back( + RegionInstance::create_instance(task_inputs.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + } + } + } + } + + std::vector > raw_exchange(graphs.size()); + std::vector > war_exchange(graphs.size()); + { + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(Barrier); + } + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + raw_exchange.at(graph_index).resize(graph.max_width); + war_exchange.at(graph_index).resize(graph.max_width); + + Rect1 bounds(Point1(0), Point1(graph.max_width - 1)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + auto memory = proc_zcopymems.at(proc); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + for (long point = first_point; point <= last_point; ++point) { + events.push_back( + RegionInstance::create_instance(raw_exchange.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + events.push_back( + RegionInstance::create_instance(war_exchange.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + } + } + } + } + + Event::merge_events(events).wait(); + events.clear(); + + Barrier sync_bar = Barrier::create_barrier(num_procs); + + Barrier first_start_bar = + Barrier::create_barrier(num_procs, REDOP_MIN, &RedopMin::identity, + sizeof(RedopMin::identity)); + Barrier last_start_bar = + Barrier::create_barrier(num_procs, REDOP_MAX, &RedopMax::identity, + sizeof(RedopMax::identity)); + Barrier first_stop_bar = + Barrier::create_barrier(num_procs, REDOP_MIN, &RedopMin::identity, + sizeof(RedopMin::identity)); + Barrier last_stop_bar = + Barrier::create_barrier(num_procs, REDOP_MAX, &RedopMax::identity, + sizeof(RedopMax::identity)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + + ShardArgs args; + args.proc_index = 
proc_index; + args.num_procs = num_procs; + args.num_fields = num_fields; + args.force_copies = force_copies; + args.gpumem = proc_gpumems[proc]; + args.zcopymem = proc_zcopymems[proc]; + args.sync = sync_bar; + args.first_start = first_start_bar; + args.last_start = last_start_bar; + args.first_stop = first_stop_bar; + args.last_stop = last_stop_bar; + + DynamicBufferSerializer ser(4096); + ser << args; + ser << graphs; + ser << task_results; + ser << task_inputs; + ser << raw_exchange; + ser << war_exchange; + events.push_back( + proc.spawn(SHARD_TASK, ser.get_buffer(), ser.bytes_used())); + } + + Event::merge_events(events).wait(); + events.clear(); + + unsigned long long first_start; + { + first_start_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + first_start_bar.get_result(&first_start, sizeof(first_start)); + assert(ok); + } + + unsigned long long last_start; + { + last_start_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + last_start_bar.get_result(&last_start, sizeof(last_start)); + assert(ok); + } + + unsigned long long first_stop; + { + first_stop_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + first_stop_bar.get_result(&first_stop, sizeof(first_stop)); + assert(ok); + } + + unsigned long long last_stop; + { + last_stop_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + last_stop_bar.get_result(&last_stop, sizeof(last_stop)); + assert(ok); + } + + app.report_timing((last_stop - first_start)/1e9); +} + +int main(int argc, char **argv) +{ + sleep(20); + Runtime rt; + + rt.init(&argc, &argv); + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + rt.register_task(SHARD_TASK, shard_task); + rt.register_task(LEAF_TASK, leaf_task); + rt.register_reduction(REDOP_MIN); + rt.register_reduction(REDOP_MAX); + + // select a processor to run the top level task on + Processor p = Processor::NO_PROC; + { + Machine::ProcessorQuery query(Machine::get_machine()); + query.only_kind(Processor::LOC_PROC); + p = query.first(); + } + assert(p.exists()); + + global_argc = argc; + global_argv = argv; + + // collective launch of a single task - everybody gets the same finish event + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, NULL, 0); + // request shutdown once that task is complete + rt.shutdown(e); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + return 0; +} diff --git a/realm_gpu/main.h b/realm_gpu/main.h new file mode 100644 index 00000000..f7a1dfd7 --- /dev/null +++ b/realm_gpu/main.h @@ -0,0 +1,70 @@ +/* Copyright 2020 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MAIN_H +#define MAIN_H + +#include "realm.h" + +#include "core.h" +#include "cuda_kernel.h" + +typedef long long int coord_t; + +typedef Realm::Point<1, coord_t> Point1; +typedef Realm::Rect<1, coord_t> Rect1; + +typedef Realm::Point<2, coord_t> Point2; +typedef Realm::Rect<2, coord_t> Rect2; + +struct ShardArgs { +public: + long proc_index; + long num_procs; + long num_fields; + long force_copies; + Realm::Memory gpumem; + Realm::Memory zcopymem; + Realm::Barrier sync; + Realm::Barrier first_start; + Realm::Barrier last_start; + Realm::Barrier first_stop; + Realm::Barrier last_stop; + // std::vector graphs; + // std::vector > task_results; + // std::vector > raw_exchange; + // std::vector > war_exchange; +}; + +TYPE_IS_SERIALIZABLE(ShardArgs); + +struct LeafArgs { +public: + long point, timestep; + TaskGraph graph; + char *output_ptr; + size_t output_bytes; + char *scratch_ptr; + size_t scratch_bytes; + long n_inputs; + char *device_ptr; + size_t device_bytes; + // std::vector input_ptr; + // std::vector input_bytes; +}; + +TYPE_IS_SERIALIZABLE(LeafArgs); + +#endif diff --git a/realm_gpu/make_cuda_realm.mk b/realm_gpu/make_cuda_realm.mk new file mode 100644 index 00000000..c2eae0cc --- /dev/null +++ b/realm_gpu/make_cuda_realm.mk @@ -0,0 +1,13 @@ +HAVE_CUDA ?= 0 + +ifeq ($(strip $(HAVE_CUDA)),1) +ifndef CUDA_ROOT +$(error CUDA_ROOT variable is not defined, aborting build) +endif + +CFLAGS += -DENABLE_CUDA +CXXFLAGS += -DENABLE_CUDA +LDFLAGS += -L../core -lcore_cuda_s -L$(CUDA_ROOT)/lib64 + + +endif diff --git a/realm_gpu/readme b/realm_gpu/readme new file mode 100644 index 00000000..34b7e009 --- /dev/null +++ b/realm_gpu/readme @@ -0,0 +1,5 @@ +export HAVE_CUDA=1 +export CUDA_ROOT=/path/to/cuda +export DEBUG=1 +export USE_CUDA=1 +export LG_RT_DIR=/path/to/legion/runtime