diff --git a/core/core.cc b/core/core.cc index 2b6c2f0c..61feb5f1 100644 --- a/core/core.cc +++ b/core/core.cc @@ -80,11 +80,42 @@ void Kernel::execute(long graph_index, long timestep, long point, assert(timestep >= 0 && point >= 0); execute_kernel_imbalance(*this, graph_index, timestep, point); break; + default: + assert(false && "unimplemented kernel type"); + }; +} + +void Kernel::execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, int gpu_id) const +{ + switch(type) { + case KernelType::EMPTY: + execute_kernel_empty(*this); + break; #ifdef ENABLE_CUDA case KernelType::CUDA_COMPUTE_BOUND: assert(scratch_ptr != NULL); assert(scratch_bytes > 0); - execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes); + execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, gpu_id); + break; +#endif + default: + assert(false && "unimplemented kernel type"); + }; +} + +void Kernel::execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes) const +{ + switch(type) { + case KernelType::EMPTY: + execute_kernel_empty(*this); + break; +#ifdef ENABLE_CUDA + case KernelType::CUDA_COMPUTE_BOUND: + assert(scratch_ptr != NULL); + assert(scratch_bytes > 0); + execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, device_ptr, device_bytes); break; #endif default: @@ -633,6 +664,156 @@ void TaskGraph::execute_point(long timestep, long point, k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes); } +void TaskGraph::execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + int gpu_id) const +{ +#ifdef DEBUG_CORE + // Validate graph_index + assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8); + has_executed_graph |= 1 << graph_index; +#endif + + // Validate timestep and point + assert(0 <= timestep && timestep < timesteps); + + long offset = offset_at_timestep(timestep); + long width = width_at_timestep(timestep); + assert(offset <= point && point < offset+width); + + long last_offset = offset_at_timestep(timestep-1); + long last_width = width_at_timestep(timestep-1); + + // Validate input + { + size_t idx = 0; + long dset = dependence_set_at_timestep(timestep); + size_t max_deps = num_dependencies(dset, point); + std::pair *deps = reinterpret_cast *>(alloca(sizeof(std::pair) * max_deps)); + size_t num_deps = dependencies(dset, point, deps); + for (size_t span = 0; span < num_deps; span++) { + for (long dep = deps[span].first; dep <= deps[span].second; dep++) { + if (last_offset <= dep && dep < last_offset + last_width) { + assert(idx < n_inputs); + + assert(input_bytes[idx] == output_bytes_per_task); + assert(input_bytes[idx] >= sizeof(std::pair)); + + const std::pair *input = reinterpret_cast *>(input_ptr[idx]); + for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair); ++i) { + assert(input[i].first == timestep - 1); + assert(input[i].second == dep); + } + idx++; + } + } + } + // FIXME (Elliott): Legion is currently passing in uninitialized + // memory for dependencies outside of the last offset/width. 
+ // assert(idx == n_inputs); + } + + // Validate output + assert(output_bytes == output_bytes_per_task); + assert(output_bytes >= sizeof(std::pair)); + + // Generate output + std::pair *output = reinterpret_cast *>(output_ptr); + for (size_t i = 0; i < output_bytes/sizeof(std::pair); ++i) { + output[i].first = timestep; + output[i].second = point; + } + + // Validate scratch + assert(scratch_bytes == scratch_bytes_per_task); + if (scratch_bytes > 0) { + uint64_t *scratch = reinterpret_cast(scratch_ptr); + assert(*scratch == MAGIC_VALUE); + } + + // Execute kernel + Kernel k(kernel); + k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, gpu_id); +} + +void TaskGraph::execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + char *device_ptr, size_t device_bytes) const +{ +#ifdef DEBUG_CORE + // Validate graph_index + assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8); + has_executed_graph |= 1 << graph_index; +#endif + + // Validate timestep and point + assert(0 <= timestep && timestep < timesteps); + + long offset = offset_at_timestep(timestep); + long width = width_at_timestep(timestep); + assert(offset <= point && point < offset+width); + + long last_offset = offset_at_timestep(timestep-1); + long last_width = width_at_timestep(timestep-1); + + // Validate input + { + size_t idx = 0; + long dset = dependence_set_at_timestep(timestep); + size_t max_deps = num_dependencies(dset, point); + std::pair *deps = reinterpret_cast *>(alloca(sizeof(std::pair) * max_deps)); + size_t num_deps = dependencies(dset, point, deps); + for (size_t span = 0; span < num_deps; span++) { + for (long dep = deps[span].first; dep <= deps[span].second; dep++) { + if (last_offset <= dep && dep < last_offset + last_width) { + assert(idx < n_inputs); + + assert(input_bytes[idx] == output_bytes_per_task); + assert(input_bytes[idx] >= sizeof(std::pair)); + + const std::pair *input = reinterpret_cast *>(input_ptr[idx]); + for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair); ++i) { + assert(input[i].first == timestep - 1); + assert(input[i].second == dep); + } + idx++; + } + } + } + // FIXME (Elliott): Legion is currently passing in uninitialized + // memory for dependencies outside of the last offset/width. 
+ // assert(idx == n_inputs); + } + + // Validate output + assert(output_bytes == output_bytes_per_task); + assert(output_bytes >= sizeof(std::pair)); + + // Generate output + std::pair *output = reinterpret_cast *>(output_ptr); + for (size_t i = 0; i < output_bytes/sizeof(std::pair); ++i) { + output[i].first = timestep; + output[i].second = point; + } + + // Validate scratch + assert(scratch_bytes == scratch_bytes_per_task); + if (scratch_bytes > 0) { + uint64_t *scratch = reinterpret_cast(scratch_ptr); + assert(*scratch == MAGIC_VALUE); + } + + // Execute kernel + Kernel k(kernel); + k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, device_ptr, device_bytes); +} + void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes) { assert(scratch_bytes % sizeof(uint64_t) == 0); @@ -654,7 +835,7 @@ static TaskGraph default_graph(long graph_index) graph.period = -1; graph.fraction_connected = 0.25; #ifdef ENABLE_CUDA - graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 0, 1}; + graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 1}; #else graph.kernel = {KernelType::EMPTY, 0, 16, 0.0}; #endif @@ -962,9 +1143,6 @@ App::App(int argc, char **argv) check(); -#ifdef ENABLE_CUDA - init_cuda_support(graphs); -#endif } void App::check() const @@ -1253,8 +1431,5 @@ void App::report_timing(double elapsed_seconds) const #ifdef DEBUG_CORE printf("Task Graph Execution Mask %llx\n", has_executed_graph.load()); #endif - -#ifdef ENABLE_CUDA - fini_cuda_support(); -#endif + } diff --git a/core/core.h b/core/core.h index 5b8da472..85866851 100644 --- a/core/core.h +++ b/core/core.h @@ -34,6 +34,10 @@ struct Kernel : public kernel_t { private: void execute(long graph_index, long timestep, long point, char *scratch_ptr, size_t scratch_bytes) const; + void execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, int gpu_id) const; + void execute(long graph_index, long timestep, long point, + char *scratch_ptr, size_t scratch_bytes, char* device_ptr, size_t device_bytes) const; friend struct TaskGraph; }; @@ -69,6 +73,18 @@ struct TaskGraph : public task_graph_t { const char **input_ptr, const size_t *input_bytes, size_t n_inputs, char *scratch_ptr, size_t scratch_bytes) const; + void execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + int gpu_id) const; + void execute_point(long timestep, long point, + char *output_ptr, size_t output_bytes, + const char **input_ptr, const size_t *input_bytes, + size_t n_inputs, + char *scratch_ptr, size_t scratch_bytes, + char *device_ptr, size_t device_bytes) const; static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes); }; diff --git a/core/core_c.h b/core/core_c.h index 5f3847c4..7a2bac5c 100644 --- a/core/core_c.h +++ b/core/core_c.h @@ -63,7 +63,6 @@ typedef struct kernel_t { int nb_blocks; int threads_per_block; int memcpy_required; - int gpu_id; int cuda_unroll; #endif } kernel_t; diff --git a/core/cuda_kernel.cu b/core/cuda_kernel.cu index 1c3a375d..714cc856 100644 --- a/core/cuda_kernel.cu +++ b/core/cuda_kernel.cu @@ -3,7 +3,6 @@ #include "cuda_kernel.h" std::vector local_buffer; -int nb_local_buffer = 0; size_t local_buffer_size; cudaStream_t cuda_stream_array[8]; @@ -27,61 +26,88 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A __global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, 
                                                             double *A);
 __global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);
 
-void init_cuda_support(const std::vector<TaskGraph> &graphs)
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
 {
-  int nb_gpus = 1;
+  int nb_gpus = local_gpus.size();
 
-  nb_local_buffer = nb_gpus;
-  local_buffer.reserve(nb_local_buffer);
+  local_buffer.resize(nb_gpus);  // resize (not reserve): fini_cuda_support() iterates over local_buffer.size()
 
   int nb_blocks = graphs[0].kernel.nb_blocks;
   int threads_per_block = graphs[0].kernel.threads_per_block;
   int cuda_unroll = graphs[0].kernel.cuda_unroll;
-  printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
+  // printf("init cuda support nb_gpus %d, nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_gpus, nb_blocks, threads_per_block, cuda_unroll);
 
   local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
 
   for (int i = 0; i < nb_gpus; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
     gpuErrchk( cudaMalloc((void**)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
     assert(local_buffer[i] != NULL);
     gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
   }
 }
 
-void fini_cuda_support()
+void fini_cuda_support(const std::vector<int> &local_gpus)
 {
-  for (int i = 0; i < nb_local_buffer; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+  // printf("fini cuda support nb_gpus %zu\n", local_gpus.size());
+  for (size_t i = 0; i < local_buffer.size(); i++) {
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
     gpuErrchk( cudaFree(local_buffer[i]) );
     local_buffer[i] = NULL;
     gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
   }
+  local_buffer.clear();
 }
 
-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
 {
-// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
+  // printf("CUDA COMPUTE KERNEL buffer %p, size %zu, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
   assert(scratch_bytes <= local_buffer_size);
-  assert(kernel.gpu_id == 0);
-
+
+  if (kernel.memcpy_required == 1) {
+    // printf("enable memcpy in\n");
+    gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
+  }
+  if (kernel.cuda_unroll == 4) {
+    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  } else if (kernel.cuda_unroll == 8) {
+    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  } else if (kernel.cuda_unroll == 16) {
+    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  } else {
+    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
+  }
+  gpuErrchk( cudaPeekAtLastError() );
+  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
+  if (kernel.memcpy_required == 1) {
+    // printf("enable memcpy out\n");
+    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
+  }
+}
+
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes)
+{
+  // printf("CUDA COMPUTE KERNEL scratch_ptr %p, size %zu, nb_blocks %d, threads_per_block %d, device_ptr %p, size %zu\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block, device_ptr, device_bytes);
+  assert(scratch_bytes <= device_bytes);
+
   if (kernel.memcpy_required == 1) {
     // printf("enable memcpy in\n");
-    gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(device_ptr, scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, 0) );
+    gpuErrchk( cudaStreamSynchronize(0) );
   }
   if (kernel.cuda_unroll == 4) {
-    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   } else if (kernel.cuda_unroll == 8) {
-    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   } else if (kernel.cuda_unroll == 16) {
-    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   } else {
-    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block>>>(kernel.iterations, (double *)device_ptr);
   }
   gpuErrchk( cudaPeekAtLastError() );
-  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+  gpuErrchk( cudaStreamSynchronize(0) );
   if (kernel.memcpy_required == 1) {
     // printf("enable memcpy out\n");
-    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(scratch_ptr, device_ptr, scratch_bytes, cudaMemcpyDeviceToHost, 0) );
+    gpuErrchk( cudaStreamSynchronize(0) );
   }
 }
diff --git a/core/cuda_kernel.h b/core/cuda_kernel.h
index 4a6ba2a8..caea05a6 100644
--- a/core/cuda_kernel.h
+++ b/core/cuda_kernel.h
@@ -11,10 +11,12 @@
 extern std::vector<char*> local_buffer;
 extern size_t local_buffer_size;
 
-void init_cuda_support(const std::vector<TaskGraph> &graphs);
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus);
 
-void fini_cuda_support();
+void fini_cuda_support(const std::vector<int> &local_gpus);
 
-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes);
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id);
+
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes);
 #endif
\ No newline at end of file
diff --git a/mpi_gpu/nonblock_gpu.cc b/mpi_gpu/nonblock_gpu.cc
index 2e45e6d3..7f62c7cb 100644
--- a/mpi_gpu/nonblock_gpu.cc
+++ b/mpi_gpu/nonblock_gpu.cc
@@ -22,7 +22,9 @@
 
 #include "core.h"
 #include "mpi.h"
+#include "cuda_kernel.h"
 
+// Only supports one GPU per MPI process
 int main(int argc, char *argv[])
 {
   MPI_Init(&argc, &argv);
@@ -31,6 +33,14 @@ int main(int argc, char *argv[])
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
   App app(argc, argv);
+
+  int
local_rank, local_size; + MPI_Comm MPI_COMM_LOCAL; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_LOCAL); + MPI_Comm_rank(MPI_COMM_LOCAL, &local_rank); + std::vector local_gpus(1, local_rank); + init_cuda_support(app.graphs, local_gpus); + if (rank == 0) app.display(); double elapsed_time = 0.0; @@ -177,7 +187,7 @@ int main(int argc, char *argv[]) graph.execute_point(timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), point_input_bytes.data(), point_n_inputs, - scratch_ptr, scratch_bytes); + scratch_ptr, scratch_bytes, 0); } } } @@ -195,5 +205,7 @@ int main(int argc, char *argv[]) app.report_timing(elapsed_time); } + fini_cuda_support(local_gpus); + MPI_Finalize(); } diff --git a/realm_gpu/Makefile b/realm_gpu/Makefile new file mode 100644 index 00000000..3070d44d --- /dev/null +++ b/realm_gpu/Makefile @@ -0,0 +1,58 @@ +# Copyright 2020 Stanford University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +ifndef LG_RT_DIR +$(error LG_RT_DIR variable is not defined, aborting build) +endif + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 0 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= task_bench +# List all the application source files here +GEN_SRC ?= main.cc # .cc files +GEN_GPU_SRC ?= # .cu files + +# You can modify these variables, some will be appended to by the runtime makefile +INC_FLAGS ?= +CC_FLAGS ?= +NVCC_FLAGS ?= +GASNET_FLAGS ?= +LD_FLAGS ?= + +CC_FLAGS += -std=c++11 -I../core +LD_FLAGS += -L../core -lcore_s + +include ../core/make_blas.mk +include ./make_cuda_realm.mk +########################################################################### +# +# Don't change anything below here +# +########################################################################### + +include $(LG_RT_DIR)/runtime.mk + +# Make absolutely sure application code is compiled with -O3 (in non-debug mode) +ifneq ($(strip $(DEBUG)),1) +$(GEN_OBJS) : CC_FLAGS += -O3 +endif diff --git a/realm_gpu/main.cc b/realm_gpu/main.cc new file mode 100644 index 00000000..eb4ca31d --- /dev/null +++ b/realm_gpu/main.cc @@ -0,0 +1,1171 @@ +/* Copyright 2020 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "main.h" + +#include +#include +#include + +#include + +#include "realm.h" + +#include "core.h" + +#include +#include +#include + +using namespace Realm; +using namespace Realm::Serialization; + +TYPE_IS_SERIALIZABLE(TaskGraph); + +// Hack: Track argc/argv in globals to avoid needing to serialize arguments to top_level task. +static int global_argc = 0; +static char **global_argv = NULL; + +enum { + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE, + SHARD_TASK, + LEAF_TASK, +}; + +enum { + REDOP_MIN = 11, + REDOP_MAX = 12, +}; + +enum { + FID_FIRST = 100, +}; + +#define DECLARE_REDUCTION(CLASS, T, U, APPLY_OP, FOLD_OP, ID) \ + class CLASS { \ + public: \ + typedef T LHS, RHS; \ + template \ + static void apply(LHS &lhs, RHS rhs); \ + template \ + static void fold(RHS &rhs1, RHS rhs2); \ + static const T identity; \ + }; \ + \ + const T CLASS::identity = ID; \ + \ + template <> \ + void CLASS::apply(LHS & lhs, RHS rhs) \ + { \ + lhs = APPLY_OP(lhs, rhs); \ + } \ + \ + template <> \ + void CLASS::apply(LHS & lhs, RHS rhs) \ + { \ + volatile U *target = (U *)&(lhs); \ + union { \ + U as_U; \ + T as_T; \ + } oldval, newval; \ + do { \ + oldval.as_U = *target; \ + newval.as_T = APPLY_OP(oldval.as_T, rhs); \ + } while (!__sync_bool_compare_and_swap(target, oldval.as_U, newval.as_U)); \ + } \ + \ + template <> \ + void CLASS::fold(RHS & rhs1, RHS rhs2) \ + { \ + rhs1 = FOLD_OP(rhs1, rhs2); \ + } \ + \ + template <> \ + void CLASS::fold(RHS & rhs1, RHS rhs2) \ + { \ + volatile U *target = (U *)&rhs1; \ + union { \ + U as_U; \ + T as_T; \ + } oldval, newval; \ + do { \ + oldval.as_U = *target; \ + newval.as_T = FOLD_OP(oldval.as_T, rhs2); \ + } while (!__sync_bool_compare_and_swap(target, oldval.as_U, newval.as_U)); \ + } + +DECLARE_REDUCTION(RedopMin, unsigned long long, unsigned long long, + std::min, std::min, std::numeric_limits::max()) +DECLARE_REDUCTION(RedopMax, unsigned long long, unsigned long long, + std::max, std::max, std::numeric_limits::min()) + +#undef DECLARE_REDUCTION + +Event copy(RegionInstance src_inst, RegionInstance dst_inst, FieldID fid, + size_t value_size, Event wait_for) +{ + CopySrcDstField src_field; + src_field.inst = src_inst; + src_field.field_id = fid; + src_field.size = value_size; + + std::vector src_fields; + src_fields.push_back(src_field); + + CopySrcDstField dst_field; + dst_field.inst = dst_inst; + dst_field.field_id = fid; + dst_field.size = value_size; + + std::vector dst_fields; + dst_fields.push_back(dst_field); + + return dst_inst.get_indexspace<1, coord_t>().copy( + src_fields, dst_fields, ProfilingRequestSet(), wait_for); +} + +Event fill(RegionInstance dst_inst, FieldID fid, + const void *value, size_t value_size, + Event wait_for) +{ + CopySrcDstField dst_field; + dst_field.inst = dst_inst; + dst_field.field_id = fid; + dst_field.size = value_size; + + std::vector dst_fields; + dst_fields.push_back(dst_field); + return dst_inst.get_indexspace<1, coord_t>().fill( + dst_fields, ProfilingRequestSet(), value, value_size, wait_for); +} + +char *get_base(RegionInstance inst, FieldID fid) +{ + AffineAccessor acc = + AffineAccessor(inst, fid); + return reinterpret_cast( + acc.ptr(inst.get_indexspace<1, coord_t>().bounds.lo)); +} + +void leaf_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) +{ + LeafArgs a; + std::vector input_ptr; + std::vector input_bytes; + { + FixedBufferDeserializer 
ser(args, arglen); + ser >> a; + ser >> input_ptr; + ser >> input_bytes; + assert(ser.bytes_left() == 0); + } + + // std::cout << "leaf_task: " << " point " << a.point << " timestep " << a.timestep << std::endl; + a.graph.execute_point(a.timestep, a.point, + a.output_ptr, a.output_bytes, + (const char **)input_ptr.data(), (size_t *)input_bytes.data(), a.n_inputs, + a.scratch_ptr, a.scratch_bytes, a.device_ptr, a.device_bytes); +} + +void shard_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) +{ + ShardArgs a; + std::vector graphs; + std::vector > task_results; + std::vector > task_inputs; + std::vector > raw_exchange; + std::vector > war_exchange; + { + FixedBufferDeserializer ser(args, arglen); + ser >> a; + ser >> graphs; + ser >> task_results; + ser >> task_inputs; + ser >> raw_exchange; + ser >> war_exchange; + + assert(ser.bytes_left() == 0); + } + + long proc_index = a.proc_index; + long num_procs = a.num_procs; + long num_fields = a.num_fields; + long force_copies = a.force_copies; + Memory gpumem = a.gpumem; + Memory zcopymem = a.zcopymem; + Barrier sync = a.sync; + Barrier first_start = a.first_start; + Barrier last_start = a.last_start; + Barrier first_stop = a.first_stop; + Barrier last_stop = a.last_stop; + + RegionInstance device_inst; + size_t device_bytes; + char *device_ptr; + { + std::map field_sizes; + field_sizes[FID_FIRST] = sizeof(char); + + int nb_blocks = graphs[0].kernel.nb_blocks; + int threads_per_block = graphs[0].kernel.threads_per_block; + int cuda_unroll = graphs[0].kernel.cuda_unroll; + device_bytes = nb_blocks * threads_per_block * sizeof(double); + Rect1 bounds(Point1(0), Point1(device_bytes * cuda_unroll)); + RegionInstance::create_instance(device_inst, a.gpumem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()).wait(); + device_ptr = get_base(device_inst, FID_FIRST); + } + // Figure out who we're going to be communicating with. 
+ + // graph -> point -> [remote point] + std::vector > > raw_exchange_points(graphs.size()); + std::vector > > war_exchange_points(graphs.size()); + + // graph -> [remote point] + std::vector > raw_all_points(graphs.size()); + std::vector > war_all_points(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_exchange_points.at(graph_index).resize(last_point - first_point + 1); + war_exchange_points.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long max_dset = graph.max_dependence_sets(); + for (long dset = 0; dset < max_dset; ++dset) { + for (auto interval : graph.dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + raw_exchange_points.at(graph_index).at(point - first_point).insert(dep); + raw_all_points.at(graph_index).insert(dep); + } + } + for (auto interval : graph.reverse_dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + war_exchange_points.at(graph_index).at(point - first_point).insert(dep); + war_all_points.at(graph_index).insert(dep); + } + } + } + } + } + + // graph -> point -> dset -> [remote point] + std::vector > > > raw_points_not_in_dset(graphs.size()); + std::vector > > > war_points_not_in_dset(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_points_not_in_dset.at(graph_index).resize(last_point - first_point + 1); + war_points_not_in_dset.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long max_dset = graph.max_dependence_sets(); + + raw_points_not_in_dset.at(graph_index).at(point - first_point).resize(max_dset); + war_points_not_in_dset.at(graph_index).at(point - first_point).resize(max_dset); + + for (long dset = 0; dset < max_dset; ++dset) { + std::set raw_points = raw_exchange_points.at(graph_index).at(point - first_point); + std::set war_points = war_exchange_points.at(graph_index).at(point - first_point); + + for (auto interval : graph.dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + raw_points.erase(dep); + } + } + for (auto interval : graph.reverse_dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + war_points.erase(dep); + } + } + + raw_points_not_in_dset.at(graph_index).at(point - first_point).at(dset).assign(raw_points.begin(), raw_points.end()); + war_points_not_in_dset.at(graph_index).at(point - first_point).at(dset).assign(war_points.begin(), war_points.end()); + } + } + } + + // Create local input regions. 
+ + // graph -> point -> max deps + std::vector > max_deps(graphs.size()); + long all_max_deps = 0; + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + max_deps.at(graph_index).resize(graph.max_width, 0); + + for (long point = 0; point < graph.max_width; ++point) { + long max_dset = graph.max_dependence_sets(); + for (long dset = 0; dset < max_dset; ++dset) { + long deps = 0; + for (auto interval : graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; + } + max_deps.at(graph_index).at(point) = std::max(max_deps.at(graph_index).at(point), deps); + all_max_deps = std::max(all_max_deps, deps); + } + } + } + + // graph -> point -> dep -> instance + std::vector > > inputs(graphs.size()); + // graph -> point -> dep -> field -> base pointer + std::vector > > > input_base(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + Rect1 bounds(Point1(0), Point1(graph.output_bytes_per_task - 1)); + + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(char); + } + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + inputs.at(graph_index).resize(last_point - first_point + 1); + input_base.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long deps = max_deps.at(graph_index).at(point); + + inputs.at(graph_index).at(point - first_point).resize(deps); + input_base.at(graph_index).at(point - first_point).resize(deps); + + AffineAccessor task_input = + AffineAccessor(task_inputs.at(graph_index).at(point), FID_FIRST); + + for (long dep = 0; dep < deps; ++dep) { + RegionInstance &inst = inputs.at(graph_index).at(point - first_point).at(dep); + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + + input_base.at(graph_index).at(point - first_point).at(dep).resize(num_fields); + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + input_base.at(graph_index).at(point - first_point).at(dep).at(fid - FID_FIRST) = + get_base(inst, fid); + } + + task_input[dep] = inst; + } + for (long dep = deps; dep < all_max_deps; ++dep) { + task_input[dep] = RegionInstance::NO_INST; + } + } + } + + // Create barriers. 
+ + // graph -> point -> field -> remote point -> barrier + std::vector > > > raw_in(graphs.size()); + std::vector > > > war_in(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_in.at(graph_index).resize(last_point - first_point + 1); + war_in.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + auto &raw_points = raw_exchange_points.at(graph_index).at(point - first_point); + auto &war_points = war_exchange_points.at(graph_index).at(point - first_point); + + raw_in.at(graph_index).at(point - first_point).resize(num_fields); + war_in.at(graph_index).at(point - first_point).resize(num_fields); + + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + const Barrier no_barrier = Barrier::NO_BARRIER; + fill(raw_exchange.at(graph_index).at(point), fid, + &no_barrier, sizeof(no_barrier), Event::NO_EVENT) + .wait(); + fill(war_exchange.at(graph_index).at(point), fid, + &no_barrier, sizeof(no_barrier), Event::NO_EVENT) + .wait(); + + AffineAccessor raw = + AffineAccessor(raw_exchange.at(graph_index).at(point), fid); + AffineAccessor war = + AffineAccessor(war_exchange.at(graph_index).at(point), fid); + + for (auto dep : raw_points) { + Barrier bar = Barrier::create_barrier(1); + raw_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + raw[dep] = bar; + } + for (auto dep : war_points) { + Barrier bar = Barrier::create_barrier(1); + war_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + war[dep] = bar; + } + } + } + } + + // Perform input region/barrier exchange. 
+ sync.arrive(1); + sync.wait(); + sync = sync.advance_barrier(); + + // graph -> remote point -> instance + std::vector > raw_local_out(graphs.size()); + std::vector > war_local_out(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + Rect1 bounds(Point1(0), Point1(graph.max_width - 1)); + + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(Barrier); + } + + for (auto &dep : war_all_points.at(graph_index)) { + RegionInstance inst; + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + raw_local_out.at(graph_index)[dep] = inst; + } + + for (auto &dep : raw_all_points.at(graph_index)) { + RegionInstance inst; + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + war_local_out.at(graph_index)[dep] = inst; + } + } + + std::vector events; + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + for (auto &dep : war_all_points.at(graph_index)) { + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + events.push_back( + copy(raw_exchange.at(graph_index).at(dep), + raw_local_out.at(graph_index).at(dep), + fid, sizeof(Barrier), Event::NO_EVENT)); + } + } + + for (auto &dep : raw_all_points.at(graph_index)) { + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + events.push_back( + copy(war_exchange.at(graph_index).at(dep), + war_local_out.at(graph_index).at(dep), + fid, sizeof(Barrier), Event::NO_EVENT)); + } + } + } + Event::merge_events(events).wait(); + events.clear(); + + // graph -> point -> field -> remote point -> barrier + std::vector > > > raw_out(graphs.size()); + std::vector > > > war_out(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + raw_out.at(graph_index).resize(last_point - first_point + 1); + war_out.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + auto &raw_points = raw_exchange_points.at(graph_index).at(point - first_point); + auto &war_points = war_exchange_points.at(graph_index).at(point - first_point); + + raw_out.at(graph_index).at(point - first_point).resize(num_fields); + war_out.at(graph_index).at(point - first_point).resize(num_fields); + + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + for (auto dep : war_points) { + AffineAccessor raw = + AffineAccessor(raw_local_out.at(graph_index).at(dep), fid); + Barrier bar = raw[point]; + assert(bar != Barrier::NO_BARRIER); + raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + } + for (auto dep : raw_points) { + AffineAccessor war = + AffineAccessor(war_local_out.at(graph_index).at(dep), fid); + Barrier bar = war[point]; + assert(bar != Barrier::NO_BARRIER); + war_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST)[dep] = bar; + } + } + } + } + + // graph -> remote point -> instance + std::vector > remote_task_inputs(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + Rect1 bounds(Point1(0), Point1(all_max_deps - 1)); + + std::map field_sizes; + field_sizes[FID_FIRST] = sizeof(RegionInstance); + + for 
(auto &dep : war_all_points.at(graph_index)) { + RegionInstance inst; + RegionInstance::create_instance(inst, zcopymem, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + remote_task_inputs.at(graph_index)[dep] = inst; + } + } + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + for (auto &dep : war_all_points.at(graph_index)) { + events.push_back( + copy(task_inputs.at(graph_index).at(dep), + remote_task_inputs.at(graph_index).at(dep), + FID_FIRST, sizeof(RegionInstance), Event::NO_EVENT)); + } + } + Event::merge_events(events).wait(); + events.clear(); + + // graph -> point -> remote point -> dep -> instance + std::vector > > > remote_inputs(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + remote_inputs.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + auto &war_points = war_exchange_points.at(graph_index).at(point - first_point); + + for (auto dep : war_points) { + remote_inputs.at(graph_index).at(point - first_point)[dep].resize(all_max_deps); + + AffineAccessor remote_task_input = + AffineAccessor(remote_task_inputs.at(graph_index).at(dep), FID_FIRST); + + for (long remote_dep = 0; remote_dep < all_max_deps; ++remote_dep) { + RegionInstance inst = remote_task_input[remote_dep]; + remote_inputs.at(graph_index).at(point - first_point)[dep].at(remote_dep) = inst; + } + } + } + } + + // graph -> point -> dset -> remote point -> slot + std::vector > > > remote_input_slot(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + remote_input_slot.at(graph_index).resize(last_point - first_point + 1); + + for (long point = first_point; point <= last_point; ++point) { + long max_dset = graph.max_dependence_sets(); + + remote_input_slot.at(graph_index).at(point - first_point).resize(max_dset); + + for (long dset = 0; dset < max_dset; ++dset) { + for (auto interval : graph.reverse_dependencies(dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + long slot = 0; + bool found = false; + for (auto interval : graph.dependencies(dset, dep)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + if (point == dep) { + found = true; + break; + } + slot++; + } + if (found) break; + } + assert(found); + remote_input_slot.at(graph_index).at(point - first_point).at(dset)[dep] = slot; + } + } + } + } + } + + // graph -> point -> field -> base pointer + std::vector > > result_base(graphs.size()); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + result_base.at(graph_index).resize(graph.max_width); + + for (long point = 0; point < graph.max_width; ++point) { + + result_base.at(graph_index).at(point).resize(num_fields, NULL); + + auto &inst = task_results.at(graph_index).at(point); + if (inst.get_location() == gpumem || inst.get_location() == zcopymem) { + for (long fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + result_base.at(graph_index).at(point).at(fid - FID_FIRST) = + get_base(inst, fid); + } + } + } + } + + std::vector 
input_ptr(all_max_deps, 0); + std::vector input_bytes(all_max_deps, 0); + + size_t max_scratch_bytes = 0; + for (auto graph : graphs) { + max_scratch_bytes = std::max(max_scratch_bytes, graph.scratch_bytes_per_task); + } + + // It's ok to use only a single scratch buffer because the tasks + // will be effectively serialized on this processor. + char *scratch_ptr = NULL; + cudaHostAlloc((void**)&(scratch_ptr), max_scratch_bytes, 0); + assert(scratch_ptr); + TaskGraph::prepare_scratch(scratch_ptr, max_scratch_bytes); + + // Statically allocate buffer to use for task input + size_t leaf_bufsize = 0; + { + LeafArgs leaf_args; + ByteCountSerializer ser; + ser << leaf_args; + ser << input_ptr; + ser << input_bytes; + leaf_bufsize = ser.bytes_used(); + } + void *leaf_buffer = malloc(leaf_bufsize); + assert(leaf_buffer); + + // Sync again to avoid staggered start + sync.arrive(1); + sync.wait(); + sync = sync.advance_barrier(); + + // Main loop + unsigned long long start_time = 0, stop_time = 0; + std::vector preconditions; + std::vector > > copy_postconditions; + for (long rep = 0; rep < 1; ++rep) { + start_time = Clock::current_time_in_nanoseconds(); + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + std::fill(input_bytes.begin(), input_bytes.end(), graph.output_bytes_per_task); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + + // Don't leak copy postconditions between graphs. + copy_postconditions.resize(last_point - first_point + 1); + for (auto &point_postconditions : copy_postconditions) { + point_postconditions.resize(num_fields); + for (auto &postconditions : point_postconditions) { + postconditions.clear(); + } + } + + for (long timestep = 0; timestep < graph.timesteps; ++timestep) { + long dset = graph.dependence_set_at_timestep(timestep); + long next_dset = graph.dependence_set_at_timestep(timestep + 1); + long last_field_dset = graph.dependence_set_at_timestep(std::max(timestep - num_fields + 1, 0L)); + + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long last_field_offset = graph.offset_at_timestep(timestep - num_fields + 1); + long last_field_width = graph.width_at_timestep(timestep - num_fields + 1); + + long next_offset = timestep + 1 < graph.timesteps ? graph.offset_at_timestep(timestep+1) : 0; + long next_width = timestep + 1 < graph.timesteps ? 
graph.width_at_timestep(timestep+1) : 0; + + long fid = FID_FIRST + timestep % num_fields; + long last_fid = FID_FIRST + (timestep + num_fields - 1) % num_fields; + + for (long point = first_point; point <= last_point; ++point) { + // Gather inputs + long n_inputs = 0, slot = 0; + preconditions.clear(); + preconditions.insert(preconditions.begin(), + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).begin(), + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).end()); + const auto &dset_deps = graph.dependencies(dset, point); + for (auto interval : dset_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + Barrier &ready = raw_in.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST).at(dep); + preconditions.push_back(ready.get_previous_phase()); + + if (dep >= last_offset && dep < last_offset + last_width) { + char *data = result_base.at(graph_index).at(dep).at(last_fid - FID_FIRST); + if (point >= offset && point < offset + width) { + if (data && !force_copies) { + // Data available locally + } else { + // Data is remote + data = input_base.at(graph_index).at(point - first_point).at(slot).at(last_fid - FID_FIRST); + } + } + input_ptr.at(n_inputs) = reinterpret_cast(data); + n_inputs++; + } + slot++; + } + } + + // Dependencies can occur in one of two ways: + // 1. The dependent task is local, so copy is not necessary. + // (In this case the dependency catches on the task itself.) + // 2. The dependent task is non-local, so copy is necessary. + // (In this case the dependency catches on the copy.) + + // WAR dependencies (part 1) + for (auto interval : graph.reverse_dependencies(last_field_dset, point)) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + if (dep >= last_field_offset && dep < last_field_offset + last_field_width) { + // Only copy when the dependent task doesn't live in the same address space. 
+ if (!force_copies && result_base.at(graph_index).at(dep).at(last_fid - FID_FIRST)) { + Barrier &ready = war_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep); + preconditions.push_back(ready.get_previous_phase()); + } + } + } + } + + // WAR dependencies (part 2) + const auto &next_dset_rev_deps = graph.reverse_dependencies(next_dset, point); + for (auto interval : next_dset_rev_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + if (force_copies || !result_base.at(graph_index).at(dep).at(last_fid - FID_FIRST)) { + Barrier &ready = war_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep); + preconditions.push_back(ready.get_previous_phase()); + } + } + } + + // Launch task + Event task_postcondition = Event::NO_EVENT; + if (point >= offset && point < offset + width) { + LeafArgs leaf_args; + leaf_args.point = point; + leaf_args.timestep = timestep; + leaf_args.graph = graph; + leaf_args.output_ptr = result_base.at(graph_index).at(point).at(fid - FID_FIRST); + leaf_args.output_bytes = graph.output_bytes_per_task; + leaf_args.scratch_ptr = scratch_ptr; + leaf_args.scratch_bytes = graph.scratch_bytes_per_task; + leaf_args.n_inputs = n_inputs; + leaf_args.device_ptr = device_ptr; + leaf_args.device_bytes = device_bytes; + + FixedBufferSerializer ser(leaf_buffer, leaf_bufsize); + ser << leaf_args; + ser << input_ptr; + ser << input_bytes; + assert(ser.bytes_left() == 0); + + task_postcondition = + p.spawn(LEAF_TASK, leaf_buffer, leaf_bufsize, + Event::merge_events(preconditions)); + + // FIXME: Figure out which tasks we actually need to wait on + events.push_back(task_postcondition); + } + + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).clear(); + + // RAW dependencies + for (auto interval : next_dset_rev_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + Barrier &complete = raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep); + Event postcondition = task_postcondition; + if (dep >= next_offset && dep < next_offset + next_width) { + // Only copy when the dependent task doesn't live in the same address space. + if (force_copies || !result_base.at(graph_index).at(dep).at(fid - FID_FIRST)) { + long slot = remote_input_slot.at(graph_index).at(point - first_point).at(next_dset).at(dep); + postcondition = copy( + task_results.at(graph_index).at(point), + remote_inputs + .at(graph_index) + .at(point - first_point) + .at(dep) + .at(slot), + fid, graph.output_bytes_per_task, + task_postcondition); + copy_postconditions.at(point - first_point).at(fid - FID_FIRST).push_back(postcondition); + } + } + complete.arrive(1, postcondition); + } + } + // Also need to arrive at any points not included in this + // dset, otherwise we'll deadlock. + for (long dep : war_points_not_in_dset.at(graph_index).at(point - first_point).at(next_dset)) { + raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST).at(dep).arrive(1); + } + + // WAR dependencies + for (auto interval : dset_deps) { + for (long dep = interval.first; dep <= interval.second; ++dep) { + Barrier &complete = war_out.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST).at(dep); + complete.arrive(1, task_postcondition); + } + } + // Also need to arrive at any points not included in this + // dset, otherwise we'll deadlock. 
+ for (long dep : raw_points_not_in_dset.at(graph_index).at(point - first_point).at(dset)) { + war_out.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST).at(dep).arrive(1, task_postcondition); + } + + for (auto &bar : raw_in.at(graph_index).at(point - first_point).at(fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + for (auto &bar : raw_out.at(graph_index).at(point - first_point).at(fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + for (auto &bar : war_in.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + for (auto &bar : war_out.at(graph_index).at(point - first_point).at(last_fid - FID_FIRST)) { + bar.second = bar.second.advance_barrier(); + } + + } + } + } + + Event::merge_events(events).wait(); + events.clear(); + + stop_time = Clock::current_time_in_nanoseconds(); + } + + first_start.arrive(1, Event::NO_EVENT, &start_time, sizeof(start_time)); + last_start.arrive(1, Event::NO_EVENT, &start_time, sizeof(start_time)); + first_stop.arrive(1, Event::NO_EVENT, &stop_time, sizeof(stop_time)); + last_stop.arrive(1, Event::NO_EVENT, &stop_time, sizeof(stop_time)); + + free(leaf_buffer); + cudaFreeHost(scratch_ptr); + +} + +void top_level_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) +{ + assert(global_argv != NULL); + App app(global_argc, global_argv); + auto &graphs = app.graphs; + + long num_fields = 5; + bool force_copies = false; + for (int i = 1; i < global_argc; i++) { + if (!strcmp(global_argv[i], "-field")) { + long value = atol(global_argv[++i]); + if (value <= 0) { + fprintf(stderr, "error: Invalid flag \"-field %ld\" must be > 1\n", value); + abort(); + } + num_fields = value; + } + + if (!strcmp(global_argv[i], "-force-copies")) { + force_copies = true; + } + } + + app.display(); + + Machine machine = Machine::get_machine(); + + std::vector procs; + { + Machine::ProcessorQuery query(machine); + query.only_kind(Processor::TOC_PROC); + procs.insert(procs.end(), query.begin(), query.end()); + } + long num_procs = procs.size(); + + std::map proc_zcopymems; + std::map proc_gpumems; + { + std::vector proc_mem_affinities; + machine.get_proc_mem_affinity(proc_mem_affinities); + + for (size_t i = 0; i < proc_mem_affinities.size(); ++i) { + Machine::ProcessorMemoryAffinity &affinity = proc_mem_affinities[i]; + std::cout << "proc: " << i << ", p:" << affinity.p.kind() << ", m:" << affinity.m.kind() << std::endl; + if (affinity.p.kind() == Processor::TOC_PROC) { + if (affinity.m.kind() == Memory::Z_COPY_MEM) { + proc_zcopymems[affinity.p] = affinity.m; + } else if (affinity.m.kind() == Memory::GPU_FB_MEM) + proc_gpumems[affinity.p] = affinity.m; + } + } + } + + long all_max_deps = 0; + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + for (long point = 0; point <= graph.max_width; ++point) { + long max_dset = graph.max_dependence_sets(); + for (long dset = 0; dset < max_dset; ++dset) { + long deps = 0; + for (auto interval : graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; + } + all_max_deps = std::max(all_max_deps, deps); + } + } + } + + std::vector events; + std::vector > task_results(graphs.size()); + { + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(char); + } + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + 
auto graph = graphs.at(graph_index); + + task_results.at(graph_index).resize(graph.max_width); + + Rect1 bounds(Point1(0), Point1(graph.output_bytes_per_task - 1)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + auto memory = proc_zcopymems.at(proc); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + for (long point = first_point; point <= last_point; ++point) { + events.push_back( + RegionInstance::create_instance(task_results.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + } + } + } + } + + std::vector > task_inputs(graphs.size()); + { + std::map field_sizes; + field_sizes[FID_FIRST] = sizeof(RegionInstance); + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + task_inputs.at(graph_index).resize(graph.max_width); + + Rect1 bounds(Point1(0), Point1(all_max_deps - 1)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + auto memory = proc_zcopymems.at(proc); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + for (long point = first_point; point <= last_point; ++point) { + events.push_back( + RegionInstance::create_instance(task_inputs.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + } + } + } + } + + std::vector > raw_exchange(graphs.size()); + std::vector > war_exchange(graphs.size()); + { + std::map field_sizes; + for (unsigned fid = FID_FIRST; fid < FID_FIRST + num_fields; ++fid) { + field_sizes[fid] = sizeof(Barrier); + } + + for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { + auto graph = graphs.at(graph_index); + + raw_exchange.at(graph_index).resize(graph.max_width); + war_exchange.at(graph_index).resize(graph.max_width); + + Rect1 bounds(Point1(0), Point1(graph.max_width - 1)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + auto memory = proc_zcopymems.at(proc); + + long first_point = proc_index * graph.max_width / num_procs; + long last_point = (proc_index + 1) * graph.max_width / num_procs - 1; + for (long point = first_point; point <= last_point; ++point) { + events.push_back( + RegionInstance::create_instance(raw_exchange.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + events.push_back( + RegionInstance::create_instance(war_exchange.at(graph_index).at(point), memory, bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet())); + } + } + } + } + + Event::merge_events(events).wait(); + events.clear(); + + Barrier sync_bar = Barrier::create_barrier(num_procs); + + Barrier first_start_bar = + Barrier::create_barrier(num_procs, REDOP_MIN, &RedopMin::identity, + sizeof(RedopMin::identity)); + Barrier last_start_bar = + Barrier::create_barrier(num_procs, REDOP_MAX, &RedopMax::identity, + sizeof(RedopMax::identity)); + Barrier first_stop_bar = + Barrier::create_barrier(num_procs, REDOP_MIN, &RedopMin::identity, + sizeof(RedopMin::identity)); + Barrier last_stop_bar = + Barrier::create_barrier(num_procs, REDOP_MAX, &RedopMax::identity, + sizeof(RedopMax::identity)); + + for (long proc_index = 0; proc_index < num_procs; ++proc_index) { + auto proc = procs.at(proc_index); + + ShardArgs args; + args.proc_index = 
proc_index; + args.num_procs = num_procs; + args.num_fields = num_fields; + args.force_copies = force_copies; + args.gpumem = proc_gpumems[proc]; + args.zcopymem = proc_zcopymems[proc]; + args.sync = sync_bar; + args.first_start = first_start_bar; + args.last_start = last_start_bar; + args.first_stop = first_stop_bar; + args.last_stop = last_stop_bar; + + DynamicBufferSerializer ser(4096); + ser << args; + ser << graphs; + ser << task_results; + ser << task_inputs; + ser << raw_exchange; + ser << war_exchange; + events.push_back( + proc.spawn(SHARD_TASK, ser.get_buffer(), ser.bytes_used())); + } + + Event::merge_events(events).wait(); + events.clear(); + + unsigned long long first_start; + { + first_start_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + first_start_bar.get_result(&first_start, sizeof(first_start)); + assert(ok); + } + + unsigned long long last_start; + { + last_start_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + last_start_bar.get_result(&last_start, sizeof(last_start)); + assert(ok); + } + + unsigned long long first_stop; + { + first_stop_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + first_stop_bar.get_result(&first_stop, sizeof(first_stop)); + assert(ok); + } + + unsigned long long last_stop; + { + last_stop_bar.wait(); +#ifndef NDEBUG + bool ok = +#endif + last_stop_bar.get_result(&last_stop, sizeof(last_stop)); + assert(ok); + } + + app.report_timing((last_stop - first_start)/1e9); +} + +int main(int argc, char **argv) +{ + sleep(20); + Runtime rt; + + rt.init(&argc, &argv); + + rt.register_task(TOP_LEVEL_TASK, top_level_task); + rt.register_task(SHARD_TASK, shard_task); + rt.register_task(LEAF_TASK, leaf_task); + rt.register_reduction(REDOP_MIN); + rt.register_reduction(REDOP_MAX); + + // select a processor to run the top level task on + Processor p = Processor::NO_PROC; + { + Machine::ProcessorQuery query(Machine::get_machine()); + query.only_kind(Processor::LOC_PROC); + p = query.first(); + } + assert(p.exists()); + + global_argc = argc; + global_argv = argv; + + // collective launch of a single task - everybody gets the same finish event + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, NULL, 0); + // request shutdown once that task is complete + rt.shutdown(e); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + return 0; +} diff --git a/realm_gpu/main.h b/realm_gpu/main.h new file mode 100644 index 00000000..f7a1dfd7 --- /dev/null +++ b/realm_gpu/main.h @@ -0,0 +1,70 @@ +/* Copyright 2020 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MAIN_H +#define MAIN_H + +#include "realm.h" + +#include "core.h" +#include "cuda_kernel.h" + +typedef long long int coord_t; + +typedef Realm::Point<1, coord_t> Point1; +typedef Realm::Rect<1, coord_t> Rect1; + +typedef Realm::Point<2, coord_t> Point2; +typedef Realm::Rect<2, coord_t> Rect2; + +struct ShardArgs { +public: + long proc_index; + long num_procs; + long num_fields; + long force_copies; + Realm::Memory gpumem; + Realm::Memory zcopymem; + Realm::Barrier sync; + Realm::Barrier first_start; + Realm::Barrier last_start; + Realm::Barrier first_stop; + Realm::Barrier last_stop; + // std::vector graphs; + // std::vector > task_results; + // std::vector > raw_exchange; + // std::vector > war_exchange; +}; + +TYPE_IS_SERIALIZABLE(ShardArgs); + +struct LeafArgs { +public: + long point, timestep; + TaskGraph graph; + char *output_ptr; + size_t output_bytes; + char *scratch_ptr; + size_t scratch_bytes; + long n_inputs; + char *device_ptr; + size_t device_bytes; + // std::vector input_ptr; + // std::vector input_bytes; +}; + +TYPE_IS_SERIALIZABLE(LeafArgs); + +#endif diff --git a/realm_gpu/make_cuda_realm.mk b/realm_gpu/make_cuda_realm.mk new file mode 100644 index 00000000..c2eae0cc --- /dev/null +++ b/realm_gpu/make_cuda_realm.mk @@ -0,0 +1,13 @@ +HAVE_CUDA ?= 0 + +ifeq ($(strip $(HAVE_CUDA)),1) +ifndef CUDA_ROOT +$(error CUDA_ROOT variable is not defined, aborting build) +endif + +CFLAGS += -DENABLE_CUDA +CXXFLAGS += -DENABLE_CUDA +LDFLAGS += -L../core -lcore_cuda_s -L$(CUDA_ROOT)/lib64 + + +endif diff --git a/realm_gpu/readme b/realm_gpu/readme new file mode 100644 index 00000000..34b7e009 --- /dev/null +++ b/realm_gpu/readme @@ -0,0 +1,5 @@ +export HAVE_CUDA=1 +export CUDA_ROOT=/path/to/cuda +export DEBUG=1 +export USE_CUDA=1 +export LG_RT_DIR=/path/to/legion/runtime