Realm gpu #51

Open · wants to merge 5 commits into base: cuda
193 changes: 184 additions & 9 deletions core/core.cc
@@ -80,11 +80,42 @@ void Kernel::execute(long graph_index, long timestep, long point,
assert(timestep >= 0 && point >= 0);
execute_kernel_imbalance(*this, graph_index, timestep, point);
break;
default:
assert(false && "unimplemented kernel type");
};
}

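// New overload (added by this PR): same dispatch as the two-argument form
// above, but routes CUDA_COMPUTE_BOUND kernels to a specific local GPU.
// Judging from cuda_kernel.cu below, gpu_id indexes the per-GPU
// local_buffer/cuda_stream_array tables built by init_cuda_support(),
// not a raw CUDA device ordinal.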
void Kernel::execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, int gpu_id) const
{
switch(type) {
case KernelType::EMPTY:
execute_kernel_empty(*this);
break;
#ifdef ENABLE_CUDA
case KernelType::CUDA_COMPUTE_BOUND:
assert(scratch_ptr != NULL);
assert(scratch_bytes > 0);
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes);
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, gpu_id);
break;
#endif
default:
assert(false && "unimplemented kernel type");
};
}

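// Second new overload: rather than an internally managed per-GPU buffer,
// the caller hands in a ready-to-use device allocation (device_ptr /
// device_bytes). This looks intended for the Realm path, where the runtime
// owns GPU memory and the current CUDA device is set by the calling context.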
void Kernel::execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes) const
{
switch(type) {
case KernelType::EMPTY:
execute_kernel_empty(*this);
break;
#ifdef ENABLE_CUDA
case KernelType::CUDA_COMPUTE_BOUND:
assert(scratch_ptr != NULL);
assert(scratch_bytes > 0);
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, device_ptr, device_bytes);
break;
#endif
default:
@@ -633,6 +664,156 @@ void TaskGraph::execute_point(long timestep, long point,
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes);
}

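// The two execute_point() overloads below duplicate the validation and
// output-generation logic of the existing host-only version; they differ
// only in the trailing arguments forwarded to Kernel::execute().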
void TaskGraph::execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
int gpu_id) const
{
#ifdef DEBUG_CORE
// Validate graph_index
assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8);
has_executed_graph |= 1 << graph_index;
#endif

// Validate timestep and point
assert(0 <= timestep && timestep < timesteps);

long offset = offset_at_timestep(timestep);
long width = width_at_timestep(timestep);
assert(offset <= point && point < offset+width);

long last_offset = offset_at_timestep(timestep-1);
long last_width = width_at_timestep(timestep-1);

// Validate input
{
size_t idx = 0;
long dset = dependence_set_at_timestep(timestep);
size_t max_deps = num_dependencies(dset, point);
std::pair<long, long> *deps = reinterpret_cast<std::pair<long, long> *>(alloca(sizeof(std::pair<long, long>) * max_deps));
size_t num_deps = dependencies(dset, point, deps);
for (size_t span = 0; span < num_deps; span++) {
for (long dep = deps[span].first; dep <= deps[span].second; dep++) {
if (last_offset <= dep && dep < last_offset + last_width) {
assert(idx < n_inputs);

assert(input_bytes[idx] == output_bytes_per_task);
assert(input_bytes[idx] >= sizeof(std::pair<long, long>));

const std::pair<long, long> *input = reinterpret_cast<const std::pair<long, long> *>(input_ptr[idx]);
for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair<long, long>); ++i) {
assert(input[i].first == timestep - 1);
assert(input[i].second == dep);
}
idx++;
}
}
}
// FIXME (Elliott): Legion is currently passing in uninitialized
// memory for dependencies outside of the last offset/width.
// assert(idx == n_inputs);
}

// Validate output
assert(output_bytes == output_bytes_per_task);
assert(output_bytes >= sizeof(std::pair<long, long>));

// Generate output
std::pair<long, long> *output = reinterpret_cast<std::pair<long, long> *>(output_ptr);
for (size_t i = 0; i < output_bytes/sizeof(std::pair<long, long>); ++i) {
output[i].first = timestep;
output[i].second = point;
}

// Validate scratch
assert(scratch_bytes == scratch_bytes_per_task);
if (scratch_bytes > 0) {
uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
assert(*scratch == MAGIC_VALUE);
}

// Execute kernel
Kernel k(kernel);
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, gpu_id);
}

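// Same body as the gpu_id overload above, with device_ptr/device_bytes
// forwarded verbatim in place of a GPU index.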
void TaskGraph::execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
char *device_ptr, size_t device_bytes) const
{
#ifdef DEBUG_CORE
// Validate graph_index
assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8);
has_executed_graph |= 1 << graph_index;
#endif

// Validate timestep and point
assert(0 <= timestep && timestep < timesteps);

long offset = offset_at_timestep(timestep);
long width = width_at_timestep(timestep);
assert(offset <= point && point < offset+width);

long last_offset = offset_at_timestep(timestep-1);
long last_width = width_at_timestep(timestep-1);

// Validate input
{
size_t idx = 0;
long dset = dependence_set_at_timestep(timestep);
size_t max_deps = num_dependencies(dset, point);
std::pair<long, long> *deps = reinterpret_cast<std::pair<long, long> *>(alloca(sizeof(std::pair<long, long>) * max_deps));
size_t num_deps = dependencies(dset, point, deps);
for (size_t span = 0; span < num_deps; span++) {
for (long dep = deps[span].first; dep <= deps[span].second; dep++) {
if (last_offset <= dep && dep < last_offset + last_width) {
assert(idx < n_inputs);

assert(input_bytes[idx] == output_bytes_per_task);
assert(input_bytes[idx] >= sizeof(std::pair<long, long>));

const std::pair<long, long> *input = reinterpret_cast<const std::pair<long, long> *>(input_ptr[idx]);
for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair<long, long>); ++i) {
assert(input[i].first == timestep - 1);
assert(input[i].second == dep);
}
idx++;
}
}
}
// FIXME (Elliott): Legion is currently passing in uninitialized
// memory for dependencies outside of the last offset/width.
// assert(idx == n_inputs);
}

// Validate output
assert(output_bytes == output_bytes_per_task);
assert(output_bytes >= sizeof(std::pair<long, long>));

// Generate output
std::pair<long, long> *output = reinterpret_cast<std::pair<long, long> *>(output_ptr);
for (size_t i = 0; i < output_bytes/sizeof(std::pair<long, long>); ++i) {
output[i].first = timestep;
output[i].second = point;
}

// Validate scratch
assert(scratch_bytes == scratch_bytes_per_task);
if (scratch_bytes > 0) {
uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
assert(*scratch == MAGIC_VALUE);
}

// Execute kernel
Kernel k(kernel);
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, device_ptr, device_bytes);
}

void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes)
{
assert(scratch_bytes % sizeof(uint64_t) == 0);
@@ -654,7 +835,7 @@ static TaskGraph default_graph(long graph_index)
graph.period = -1;
graph.fraction_connected = 0.25;
#ifdef ENABLE_CUDA
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 0, 1};
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 1};
#else
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0};
#endif
@@ -962,9 +1143,6 @@ App::App(int argc, char **argv)

check();

#ifdef ENABLE_CUDA
init_cuda_support(graphs);
#endif
}

void App::check() const
@@ -1253,8 +1431,5 @@ void App::report_timing(double elapsed_seconds) const
#ifdef DEBUG_CORE
printf("Task Graph Execution Mask %llx\n", has_executed_graph.load());
#endif

#ifdef ENABLE_CUDA
fini_cuda_support();
#endif

}
16 changes: 16 additions & 0 deletions core/core.h
@@ -34,6 +34,10 @@ struct Kernel : public kernel_t {
private:
void execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes) const;
void execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, int gpu_id) const;
void execute(long graph_index, long timestep, long point,
char *scratch_ptr, size_t scratch_bytes, char* device_ptr, size_t device_bytes) const;
friend struct TaskGraph;
};

@@ -69,6 +73,18 @@ struct TaskGraph : public task_graph_t {
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes) const;
void execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
int gpu_id) const;
void execute_point(long timestep, long point,
char *output_ptr, size_t output_bytes,
const char **input_ptr, const size_t *input_bytes,
size_t n_inputs,
char *scratch_ptr, size_t scratch_bytes,
char *device_ptr, size_t device_bytes) const;
static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes);
};

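For orientation, here is a minimal sketch of how a runtime-side GPU task might drive the new gpu_id overload. All names here (gpu_task_body, my_gpu_index) are hypothetical, and it assumes init_cuda_support() has already been called with the process-local GPU list:

// Hypothetical caller -- illustrative only, not part of this PR.
void gpu_task_body(const TaskGraph &graph, long timestep, long point,
                   char *output_ptr, size_t output_bytes,
                   const char **input_ptr, const size_t *input_bytes,
                   size_t n_inputs,
                   char *scratch_ptr, size_t scratch_bytes,
                   int my_gpu_index /* index into the local GPU list */)
{
  graph.execute_point(timestep, point,
                      output_ptr, output_bytes,
                      input_ptr, input_bytes, n_inputs,
                      scratch_ptr, scratch_bytes,
                      my_gpu_index);
}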
1 change: 0 additions & 1 deletion core/core_c.h
@@ -63,7 +63,6 @@ typedef struct kernel_t {
int nb_blocks;
int threads_per_block;
int memcpy_required;
int gpu_id;
int cuda_unroll;
#endif
} kernel_t;
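/* Note that gpu_id is removed from kernel_t entirely: the target GPU is now
   chosen per execution via the new execute()/execute_point() overloads
   instead of being baked into the kernel configuration. */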
72 changes: 49 additions & 23 deletions core/cuda_kernel.cu
@@ -3,7 +3,6 @@
#include "cuda_kernel.h"

std::vector<char*> local_buffer;
int nb_local_buffer = 0;
size_t local_buffer_size;
cudaStream_t cuda_stream_array[8];

@@ -27,61 +26,88 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A);
__global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, double *A);
__global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);

void init_cuda_support(const std::vector<TaskGraph> &graphs)
void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
{
int nb_gpus = 1;
int nb_gpus = local_gpus.size();

nb_local_buffer = nb_gpus;
local_buffer.reserve(nb_local_buffer);
local_buffer.resize(nb_gpus); // resize rather than reserve: local_buffer[i] is assigned below, so the elements must exist
int nb_blocks = graphs[0].kernel.nb_blocks;
int threads_per_block = graphs[0].kernel.threads_per_block;
int cuda_unroll = graphs[0].kernel.cuda_unroll;
printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
// printf("init cuda support nb_gpus %d nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_gpus, nb_blocks, threads_per_block, cuda_unroll);
local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
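// Note: local_buffer_size omits the cuda_unroll factor used in the
// cudaMalloc below, so the scratch_bytes <= local_buffer_size assert in
// execute_kernel_compute_cuda() bounds only the host<->device copy region,
// not the full per-GPU allocation.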
for (int i = 0; i < nb_gpus; i++) {
gpuErrchk( cudaSetDevice(0) );
gpuErrchk( cudaSetDevice(local_gpus[i]) );
gpuErrchk( cudaMalloc((void**)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
assert(local_buffer[i] != NULL);
gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
}
}

void fini_cuda_support()
void fini_cuda_support(const std::vector<int> &local_gpus)
{
for (int i = 0; i < nb_local_buffer; i++) {
gpuErrchk( cudaSetDevice(0) );
// printf("fini cuda support nb_gpus %d\n", local_gpus.size());
for (size_t i = 0; i < local_buffer.size(); i++) {
gpuErrchk( cudaSetDevice(local_gpus[i]) );
gpuErrchk( cudaFree(local_buffer[i]) );
local_buffer[i] = NULL;
gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
}
local_buffer.clear();
}
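// With this change the core library no longer calls init_cuda_support()/
// fini_cuda_support() itself (the calls were dropped from App::App() and
// App::report_timing() in core.cc); each runtime driver, such as the Realm
// implementation this PR targets, is expected to invoke them with its own
// list of process-local GPU ordinals.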

void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
{
// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
assert(scratch_bytes <= local_buffer_size);
assert(kernel.gpu_id == 0);


if (kernel.memcpy_required == 1) {
// printf("enable memcpy in\n");
gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
}
if (kernel.cuda_unroll == 4) {
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
} else if (kernel.cuda_unroll == 8) {
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
} else if (kernel.cuda_unroll == 16) {
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
} else {
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
}
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
if (kernel.memcpy_required == 1) {
// printf("enable memcpy out\n");
gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
}
}

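// Variant for caller-managed device memory: data is staged into device_ptr,
// and all work is issued on the default stream (0) rather than a stream from
// cuda_stream_array -- presumably acceptable if each GPU is driven by a
// single runtime processor/thread at a time.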
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, char *device_ptr, size_t device_bytes) {
printf("CUDA COMPUTE KERNEL scratch_ptr %p, size %lld, nb_blocks %d, threads_per_block %d, device_ptr %p, size %lld\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block, device_ptr, device_bytes);
assert(scratch_bytes <= device_bytes);

if (kernel.memcpy_required == 1) {
// printf("enable memcpy in\n");
gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaMemcpyAsync(device_ptr, scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, 0) );
gpuErrchk( cudaStreamSynchronize(0) );
}
if (kernel.cuda_unroll == 4) {
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
} else if (kernel.cuda_unroll == 8) {
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
} else if (kernel.cuda_unroll == 16) {
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
} else {
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, 0>>>(kernel.iterations, (double *)device_ptr);
}
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaStreamSynchronize(0) );
if (kernel.memcpy_required == 1) {
// printf("enable memcpy out\n");
gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
gpuErrchk( cudaMemcpyAsync(scratch_ptr, device_ptr, scratch_bytes, cudaMemcpyDeviceToHost, 0) );
gpuErrchk( cudaStreamSynchronize(0) );
}
}
