Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 1: Austin Eng #8

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
uniform and coherent grid mostly completed; likely a grid indexing problem causing boids to move towards one corner
austinEng committed Sep 8, 2016
commit d969cc7bb923863685ef39b0b158c98d0d61232d
260 changes: 249 additions & 11 deletions src/kernel.cu
Original file line number Diff line number Diff line change
@@ -85,6 +85,7 @@ int *dev_gridCellEndIndices; // to this cell?

// TODO-2.3 - consider what additional buffers you might need to reshuffle
// the position and velocity data to be coherent within cells.
glm::vec3 *dev_pos2;

// LOOK-2.1 - Grid parameters based on simulation parameters.
// These are automatically computed for you in Boids::initSimulation
@@ -162,13 +163,29 @@ void Boids::initSimulation(int N) {
gridSideCount = 2 * halfSideCount;

gridCellCount = gridSideCount * gridSideCount * gridSideCount;
printf("%i CELLS\n", gridCellCount);
gridInverseCellWidth = 1.0f / gridCellWidth;
float halfGridWidth = gridCellWidth * halfSideCount;
gridMinimum.x -= halfGridWidth;
gridMinimum.y -= halfGridWidth;
gridMinimum.z -= halfGridWidth;

// TODO-2.1 TODO-2.3 - Allocate additional buffers here.
cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");

cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");

cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");

cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");

cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3));
checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!");

cudaThreadSynchronize();
}

@@ -318,10 +335,23 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
// Label each boid with the flattened index of the uniform-grid cell that
// contains it, and record a parallel identity index array so the boid data in
// pos/vel1/vel2 can still be located after the grid indices are sorted:
//   gridIndices[i] <- flattened cell index of boid i
//   indices[i]     <- i  (pointer back into the pos/vel arrays)
__global__ void kernComputeIndices(int N, int gridResolution,
  glm::vec3 gridMin, float inverseCellWidth,
  glm::vec3 *pos, int *indices, int *gridIndices) {
  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (index >= N) {
    return;
  }

  // Convert the world-space position to integer 3D cell coordinates.
  // NOTE(review): positions are assumed to lie inside the grid bounds; no
  // clamping is performed here — confirm kernUpdatePos wraps positions.
  glm::ivec3 index3D = (pos[index] - gridMin) * inverseCellWidth;
  gridIndices[index] = gridIndex3Dto1D(index3D.x, index3D.y, index3D.z, gridResolution);

  // Identity mapping; thrust::sort_by_key permutes this alongside gridIndices.
  indices[index] = index;
}

// LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -336,9 +366,51 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
// Given the cell-sorted particleGridIndices array, record for each grid cell
// the [start, end) span of sorted entries that fall inside it:
//   gridCellStartIndices[c] = first sorted position whose cell index is c
//   gridCellEndIndices[c]   = one past the last such position (exclusive)
// Cells containing no boids are never written by this kernel, so both buffers
// must be cleared (e.g. via kernResetIntBuffer) before it is launched.
__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
  int *gridCellStartIndices, int *gridCellEndIndices) {
  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (index >= N) {
    return;
  }

  int gIdx = particleGridIndices[index];

  // A cell starts wherever the sorted cell index differs from its predecessor.
  // BUG FIX: the original returned early for index 0, so the first occupied
  // cell's start index was never written.
  if (index == 0) {
    gridCellStartIndices[gIdx] = 0;
  } else {
    int gIdxPrev = particleGridIndices[index - 1];
    if (gIdxPrev != gIdx) {
      gridCellStartIndices[gIdx] = index;
      gridCellEndIndices[gIdxPrev] = index;
    }
  }

  // BUG FIX: the last boid must terminate its cell; the original never wrote
  // the final occupied cell's end index, so its boids were skipped entirely.
  if (index == N - 1) {
    gridCellEndIndices[gIdx] = N;
  }
}

// Compute the integer 3D cell index of pos, plus a 3-bit octant mask telling
// which half of its cell the boid occupies per axis. Bit 0/1/2 set means the
// boid sits in the upper half along x/y/z (so +direction neighbor cells must
// be searched); a clear bit means the lower half (-direction).
__device__ void getGridIndexAndQuadrantMask(glm::vec3 pos, glm::vec3 gridMin, float inverseCellWidth, glm::ivec3 &gIdx, unsigned char &mask) {
  glm::vec3 partialIndex3D = (pos - gridMin) * inverseCellWidth;
  gIdx = partialIndex3D;
  // Fractional position within the cell, each component in [0, 1).
  partialIndex3D = partialIndex3D - glm::floor(partialIndex3D);

  // BUG FIX: the original combined the three flags with logical OR (||), which
  // collapses the whole expression to 0 or 1 instead of building a bit mask.
  // Every boid then searched the same biased neighborhood, which is consistent
  // with the reported "boids drift toward one corner" behavior. Bitwise OR (|)
  // assembles the intended per-axis bits.
  mask = ((partialIndex3D.x >= 0.5f) << 0) |
         ((partialIndex3D.y >= 0.5f) << 1) |
         ((partialIndex3D.z >= 0.5f) << 2);
}

// Collect the flattened indices of the (at most 8) grid cells that could hold
// neighbors of a boid, given its cell gIdx and its octant mask (see
// getGridIndexAndQuadrantMask). Returns the number of cells written into
// neighborCells. Cells outside [0, gridResolution) are skipped, not wrapped —
// NOTE(review): if the simulation wraps boid positions at the boundary, edge
// boids will miss wrapped neighbors; confirm against kernUpdatePos.
__device__ int getNeighborCells(int neighborCells[8], const glm::ivec3 &gIdx, const unsigned char &mask, int gridResolution) {
int nCells = 0;
// Walk the 3x3x3 neighborhood but keep only offsets on the boid's side of its
// cell: per axis, offset -1 is allowed only when the mask bit is clear (boid
// in the lower half), +1 only when it is set, and offset 0 always passes.
for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (!(
(i == -1 && (mask & 0x1)) || (i == 1 && !(mask & 0x1)) ||
(j == -1 && (mask & 0x2)) || (j == 1 && !(mask & 0x2)) ||
(k == -1 && (mask & 0x4)) || (k == 1 && !(mask & 0x4))
)) {
glm::ivec3 idx = gIdx + glm::ivec3(i, j, k);
// Discard candidate cells that fall outside the grid on any axis.
if (!glm::any(glm::lessThan(idx, glm::ivec3(0, 0, 0))) &&
!glm::any(glm::greaterThanEqual(idx, glm::ivec3(gridResolution, gridResolution, gridResolution)))) {
neighborCells[nCells++] = gridIndex3Dto1D(idx.x, idx.y, idx.z, gridResolution);
}
}
}
}
}
return nCells;
}

// Compute each boid's new velocity using the scattered uniform grid: candidate
// neighbors are located through gridCellStart/EndIndices, and every sorted
// entry must be dereferenced through particleArrayIndices to reach the actual
// boid data. Reads vel1, writes vel2 (ping-pong buffers).
// NOTE(review): the parameter list is reconstructed to match the launch in
// stepSimulationScatteredGrid (the diff elided it) — confirm against header.
__global__ void kernUpdateVelNeighborSearchScattered(
  int N, int gridResolution, glm::vec3 gridMin,
  float inverseCellWidth, float cellWidth,
  int *gridCellStartIndices, int *gridCellEndIndices,
  int *particleArrayIndices,
  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (index >= N) {
    return;
  }

  // Identify the grid cell containing this boid and the octant of the cell it
  // occupies, then the (<= 8) cells that may contain neighbors.
  glm::ivec3 index3D;
  unsigned char mask;
  getGridIndexAndQuadrantMask(pos[index], gridMin, inverseCellWidth, index3D, mask);

  int neighborCells[8];
  int nCells = getNeighborCells(neighborCells, index3D, mask, gridResolution);

  glm::vec3 posSelf = pos[index];
  glm::vec3 cmass(0.f, 0.f, 0.f);      // rule 1: perceived center of mass
  int cmassCount = 0;
  glm::vec3 collisionV(0.f, 0.f, 0.f); // rule 2: separation impulse
  glm::vec3 cvel(0.f, 0.f, 0.f);       // rule 3: neighbor velocity sum
  int cvelCount = 0;

  for (int c = 0; c < nCells; ++c) {
    // [start, end) span of sorted boid entries belonging to this cell.
    int n = neighborCells[c];
    int start = gridCellStartIndices[n];
    int end = gridCellEndIndices[n];

    for (int b = start; b < end; ++b) {
      // One level of indirection: b indexes the sorted arrays, particleIndex
      // indexes the original pos/vel buffers.
      int particleIndex = particleArrayIndices[b];
      if (particleIndex == index) continue;

      // Hoisted: the original recomputed glm::distance three times per boid.
      glm::vec3 posOther = pos[particleIndex];
      float dist = glm::distance(posOther, posSelf);

      // Rule 1: fly towards the local perceived center of mass (excl. self).
      if (dist < rule1Distance) {
        cmass += posOther;
        cmassCount++;
      }
      // Rule 2: keep a minimum distance from nearby boids.
      if (dist < rule2Distance) {
        collisionV -= posOther - posSelf;
      }
      // Rule 3: match velocity with nearby boids.
      if (dist < rule3Distance) {
        cvel += vel1[particleIndex];
        cvelCount++;
      }
    }
  }

  cmass = cmassCount > 0 ? (cmass / (float)cmassCount) - posSelf : cmass;
  cvel = cvelCount > 0 ? cvel / (float)cvelCount : cvel;

  glm::vec3 vel = vel1[index] + cmass * rule1Scale + collisionV * rule2Scale + cvel * rule3Scale;

  // Clamp the speed before writing to vel2. BUG FIX: the original
  // unconditionally normalized, so a zero-length velocity produced NaN (0/0)
  // that then propagated through the flock. Only rescale when the speed
  // actually exceeds maxSpeed (matching the coherent kernel's guard).
  float speed = glm::length(vel);
  if (speed > maxSpeed) {
    vel *= maxSpeed / speed;
  }
  vel2[index] = vel;
}

// Gather boid data into cell-sorted order: entry i of the target buffers
// receives the pos/vel of the boid whose original index is tgtIndices[i].
__global__ void kernShuffleParticleBuffers(int N,
  glm::vec3* srcPos, glm::vec3* srcVel,
  int* tgtIndices, glm::vec3* tgtPos, glm::vec3* tgtVel) {
  int i = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (i < N) {
    // Read the source slot once; both copies use the same indirection.
    int src = tgtIndices[i];
    tgtPos[i] = srcPos[src];
    tgtVel[i] = srcVel[src];
  }
}

// Compute each boid's new velocity using the cell-coherent uniform grid: pos
// and vel1 have already been reshuffled into cell order, so
// gridCellStart/EndIndices index them directly — one less indirection than the
// scattered variant. Reads vel1, writes vel2.
// NOTE(review): the parameter list is reconstructed to match the launch in
// stepSimulationCoherentGrid (the diff elided it) — confirm against header.
__global__ void kernUpdateVelNeighborSearchCoherent(
  int N, int gridResolution, glm::vec3 gridMin,
  float inverseCellWidth, float cellWidth,
  int *gridCellStartIndices, int *gridCellEndIndices,
  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (index >= N) {
    return;
  }

  // Identify this boid's grid cell and cell octant, then the (<= 8) cells
  // that may contain neighbors.
  glm::ivec3 index3D;
  unsigned char mask;
  getGridIndexAndQuadrantMask(pos[index], gridMin, inverseCellWidth, index3D, mask);

  int neighborCells[8];
  int nCells = getNeighborCells(neighborCells, index3D, mask, gridResolution);

  glm::vec3 posSelf = pos[index];
  glm::vec3 cmass(0.f, 0.f, 0.f);      // rule 1: perceived center of mass
  int cmassCount = 0;
  glm::vec3 collisionV(0.f, 0.f, 0.f); // rule 2: separation impulse
  glm::vec3 cvel(0.f, 0.f, 0.f);       // rule 3: neighbor velocity sum
  int cvelCount = 0;

  for (int c = 0; c < nCells; ++c) {
    // [start, end) span of boids in this cell — indexes pos/vel1 directly
    // because the data is coherent.
    int n = neighborCells[c];
    int start = gridCellStartIndices[n];
    int end = gridCellEndIndices[n];

    for (int particleIndex = start; particleIndex < end; ++particleIndex) {
      if (particleIndex == index) continue;

      // Hoisted: the original recomputed glm::distance three times per boid.
      glm::vec3 posOther = pos[particleIndex];
      float dist = glm::distance(posOther, posSelf);

      // Rule 1: fly towards the local perceived center of mass (excl. self).
      if (dist < rule1Distance) {
        cmass += posOther;
        cmassCount++;
      }
      // Rule 2: keep a minimum distance from nearby boids.
      if (dist < rule2Distance) {
        collisionV -= posOther - posSelf;
      }
      // Rule 3: match velocity with nearby boids.
      if (dist < rule3Distance) {
        cvel += vel1[particleIndex];
        cvelCount++;
      }
    }
  }

  cmass = cmassCount > 0 ? (cmass / (float)cmassCount) - posSelf : cmass;
  cvel = cvelCount > 0 ? cvel / (float)cvelCount : cvel;

  glm::vec3 vel = vel1[index] + cmass * rule1Scale + collisionV * rule2Scale + cvel * rule3Scale;

  // Clamp the speed before writing to vel2. Inside the branch len > maxSpeed
  // holds, so the clamped magnitude is exactly maxSpeed; the original
  // redundantly recomputed normalize(vel) * min(maxSpeed, length(vel)) here.
  float len = glm::length(vel);
  if (len > maxSpeed) {
    vel *= maxSpeed / len;
  }
  vel2[index] = vel;
}

/**
@@ -392,43 +581,92 @@ void Boids::stepSimulationNaive(float dt) {

// One simulation step using the scattered uniform grid: boid data stays in its
// original order and the search kernel reaches it through
// dev_particleArrayIndices.
void Boids::stepSimulationScatteredGrid(float dt) {
  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
  dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);

  // Label each boid with its array index and its grid-cell index.
  kernComputeIndices<<< fullBlocksPerGrid, blockSize >>>(numObjects, gridSideCount, gridMinimum,
    gridInverseCellWidth, dev_pos, dev_particleArrayIndices,
    dev_particleGridIndices);

  // Unstable key sort using Thrust: order the boid index array by cell index.
  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);

  // BUG FIX: clear last frame's cell ranges before rebuilding them. Cells that
  // are empty this frame are never written by kernIdentifyCellStartEnd, so
  // stale start/end values would send the search kernel through garbage
  // neighbor lists. Resetting both to 0 leaves empty cells with a harmless
  // empty [0, 0) range.
  kernResetIntBuffer<<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellStartIndices, 0);
  kernResetIntBuffer<<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellEndIndices, 0);

  // Find the start/end of each cell's span in the sorted boid index array.
  kernIdentifyCellStartEnd<<< fullBlocksPerGrid, blockSize >>>(numObjects, dev_particleGridIndices,
    dev_gridCellStartIndices, dev_gridCellEndIndices);

  // Velocity update via grid-limited neighbor search (reads vel1, writes vel2).
  kernUpdateVelNeighborSearchScattered<<< fullBlocksPerGrid, blockSize >>>(numObjects, gridSideCount, gridMinimum,
    gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices,
    dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos,
    dev_vel1, dev_vel2);

  // Integrate positions with the freshly computed velocities.
  kernUpdatePos<<< fullBlocksPerGrid, blockSize >>>(numObjects, dt, dev_pos, dev_vel2);

  // Ping-pong the velocity buffers for the next step.
  std::swap(dev_vel1, dev_vel2);
}

// One simulation step using the cell-coherent uniform grid: boid pos/vel are
// physically reshuffled into cell order each frame so the search kernel needs
// no indirection through dev_particleArrayIndices.
void Boids::stepSimulationCoherentGrid(float dt) {
  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
  dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);

  // Label each boid with its array index and its grid-cell index.
  kernComputeIndices<<< fullBlocksPerGrid, blockSize >>>(numObjects, gridSideCount, gridMinimum,
    gridInverseCellWidth, dev_pos, dev_particleArrayIndices,
    dev_particleGridIndices);

  // Unstable key sort using Thrust: order the boid index array by cell index.
  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);

  // BUG FIX: clear last frame's cell ranges before rebuilding them, so cells
  // that are empty this frame keep a harmless empty [0, 0) range instead of
  // stale indices from a previous step.
  kernResetIntBuffer<<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellStartIndices, 0);
  kernResetIntBuffer<<< cellBlocksPerGrid, blockSize >>>(gridCellCount, dev_gridCellEndIndices, 0);

  // Find the start/end of each cell's span in the sorted boid index array.
  kernIdentifyCellStartEnd<<< fullBlocksPerGrid, blockSize >>>(numObjects, dev_particleGridIndices,
    dev_gridCellStartIndices, dev_gridCellEndIndices);

  // Reshuffle the simulation data into cell order: dev_pos2/dev_vel2 receive
  // coherent copies of dev_pos/dev_vel1.
  kernShuffleParticleBuffers<<< fullBlocksPerGrid, blockSize >>>(
    numObjects, dev_pos, dev_vel1,
    dev_particleArrayIndices, dev_pos2, dev_vel2);

  // Velocity update on the coherent buffers: reads the shuffled old velocities
  // (dev_vel2) and writes the new velocities straight into dev_vel1, which is
  // therefore already current for the next frame — no velocity swap needed.
  kernUpdateVelNeighborSearchCoherent<<< fullBlocksPerGrid, blockSize >>>(numObjects, gridSideCount, gridMinimum,
    gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices,
    dev_gridCellEndIndices, dev_pos2, dev_vel2, dev_vel1);

  // Integrate the coherent positions with the new velocities.
  kernUpdatePos<<< fullBlocksPerGrid, blockSize >>>(numObjects, dt, dev_pos2, dev_vel1);

  // dev_pos2 now holds the current positions; swap so dev_pos is the
  // authoritative position buffer for rendering and the next step.
  std::swap(dev_pos, dev_pos2);
}

// Release all device buffers allocated in Boids::initSimulation.
void Boids::endSimulation() {
  cudaFree(dev_vel1);
  cudaFree(dev_vel2);
  cudaFree(dev_pos);

  // Free the additional 2.1/2.3 buffers. BUG FIX: the four grid bookkeeping
  // buffers were allocated in initSimulation but never freed (device memory
  // leak); only dev_pos2 was released.
  cudaFree(dev_particleArrayIndices);
  cudaFree(dev_particleGridIndices);
  cudaFree(dev_gridCellStartIndices);
  cudaFree(dev_gridCellEndIndices);
  cudaFree(dev_pos2);
}

void Boids::unitTest() {
6 changes: 3 additions & 3 deletions src/main.cpp
Original file line number Diff line number Diff line change
@@ -14,11 +14,11 @@

// LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
#define VISUALIZE 1
#define UNIFORM_GRID 0
#define COHERENT_GRID 0
#define UNIFORM_GRID 1
#define COHERENT_GRID 1

// LOOK-1.2 - change this to adjust particle count in the simulation
const int N_FOR_VIS = 5000;
const int N_FOR_VIS = 20000;
const float DT = 0.2f;

/**