diff --git a/README.md b/README.md
index 98dd9a8..392a4fd 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,50 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
-Project 1 - Flocking**
+# University of Pennsylvania, CIS 565: GPU Programming and Architecture
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+## Project 1 - Flocking
+* Liang Peng
+* Tested on: Windows 10, i7-6700HQ @ 2.6GHz, 8GB, GTX 960M (Personal Computer)
-### (TODO: Your README)
+## Screenshots
+* Rendering
+
+  ![Flocking simulation](images/flocking.gif)
+
+* Profiling
+
+  ![Profiling results](images/profiling.PNG)
+
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Performance Analysis
+
+### With Visualization
+
+With visualization on, the framerate is capped at 60 FPS (likely vsync); the table lists the largest boid count each algorithm sustains at that cap.
+
+Algorithm | Max Boid Count | Framerate (FPS)
+:---:|:---:|:---:
+Brute-Force | 5,000 | 60
+Scattered Uniform Grid | 55,000 | 60
+Coherent Uniform Grid | 120,000 | 60
+
+### Without Visualization
+
+Algorithm | Boid Count | Framerate (FPS)
+:---:|:---:|:---:
+Brute-Force | 5,000 | 72
+Scattered Uniform Grid | 5,000 | 590
+Coherent Uniform Grid | 5,000 | 640
+
+### Block Size
+
+Boid Count | Block Size | Framerate (FPS)
+:---:|:---:|:---:
+50,000 | 16 | 109
+50,000 | 32 | 170
+50,000 | 64 | 182
+50,000 | 128 | 180
+50,000 | 256 | 170
+50,000 | 1024 | 170
+
+### Conclusion
+* When the boid count is small, the framerate holds at 60 FPS. As the boid count increases, the framerate eventually drops below 60 FPS.
+
+* The algorithm used to update boid positions and velocities has a large influence on simulation performance (see the grid-indexing sketch after this list).
+  * From 1.2 to 2.1, neighbor-search efficiency improves greatly because grid index information restricts the search to nearby cells.
+  * From 2.1 to 2.3, performance improves further because grouping the data accessed by neighboring threads raises the cache-hit rate.
+
+* Block size and block count also have some impact on performance: as block size increases (and block count decreases), the framerate first rises and then, past some point, drops (see the launch-configuration sketch after this list).
+  * _My speculation:_ If the block size is too small, the block count becomes large; since each block is scheduled onto a streaming multiprocessor (SM) and the number of SMs is limited, more scheduling rounds are needed to process all blocks. If the block size is too large, the cache and shared resources available to a block's threads are limited, so cached data is replaced more frequently, lowering the cache-hit rate and hurting performance.
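+
+As a minimal sketch of the 2.1 grid indexing (condensed from `kernel.cu`; `cellIndexOf` is an illustrative helper, not a name in the repo):
+
+```cuda
+// Flatten a 3D cell coordinate into a 1D cell index, as in kernel.cu.
+__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
+  return x + y * gridResolution + z * gridResolution * gridResolution;
+}
+
+// Map a boid position to its cell: shift into grid space, scale by the
+// inverse cell width, and truncate to integer cell coordinates.
+__device__ int cellIndexOf(glm::vec3 p, glm::vec3 gridMin,
+                           float inverseCellWidth, int gridResolution) {
+  glm::vec3 d = (p - gridMin) * inverseCellWidth;
+  return gridIndex3Dto1D((int)d.x, (int)d.y, (int)d.z, gridResolution);
+}
+```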
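+
+For the block-size experiment, the launch configuration is the usual ceiling division (a sketch; `numObjects` and `blockSize` as in the project code):
+
+```cuda
+// One thread per boid: round the block count up so every boid is covered.
+// With 50,000 boids and a block size of 128, this yields 391 blocks.
+dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+```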
diff --git a/images/flocking.gif b/images/flocking.gif
new file mode 100644
index 0000000..406136b
Binary files /dev/null and b/images/flocking.gif differ
diff --git a/images/profiling.PNG b/images/profiling.PNG
new file mode 100644
index 0000000..5071a37
Binary files /dev/null and b/images/profiling.PNG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..dff0113 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
cuda_add_library(src
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/src/kernel.cu b/src/kernel.cu
index 30356b9..818ce74 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -25,10 +25,10 @@ void checkCUDAError(const char *msg, int line = -1) {
if (cudaSuccess != err) {
if (line >= 0) {
fprintf(stderr, "Line %d: ", line);
- }
+ }
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
- }
+ }
}
@@ -70,6 +70,8 @@ glm::vec3 *dev_pos;
glm::vec3 *dev_vel1;
glm::vec3 *dev_vel2;
+glm::vec3 *dev_pos_copy;
+glm::vec3 *dev_vel1_copy;
// LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust
// pointers on your own too.
@@ -83,6 +85,23 @@ thrust::device_ptr<int> dev_thrust_particleGridIndices;
int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
int *dev_gridCellEndIndices; // to this cell?
+// Offsets for iterating over the 8 cells that can contain neighbors.
+// Each axis table holds two 8-entry halves: entries 0-7 step toward the
+// negative side (boid in the lower half of its cell along that axis),
+// entries 8-15 toward the positive side (upper half).
+int dxData[] = {
+ -1, 0, -1, 0, -1, 0, -1, 0,
+ 0, 1, 0, 1, 0, 1, 0, 1,
+};
+int dyData[] = {
+ -1, -1, 0, 0, -1, -1, 0, 0,
+ 0, 0, 1, 1, 0, 0, 1, 1,
+};
+int dzData[] = {
+ -1, -1, -1, -1, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 1, 1,
+};
+int *dev_dx;
+int *dev_dy;
+int *dev_dz;
+
// TODO-2.3 - consider what additional buffers you might need to reshuffle
// the position and velocity data to be coherent within cells.
@@ -130,7 +149,7 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo
arr[index].x = scale * rand.x;
arr[index].y = scale * rand.y;
arr[index].z = scale * rand.z;
- }
+ }
}
/**
@@ -169,6 +188,47 @@ void Boids::initSimulation(int N) {
gridMinimum.z -= halfGridWidth;
// TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+ // allocate memory on GPU
+ cudaMalloc((void**)&dev_particleArrayIndices, sizeof(int) * N);
+ checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+ cudaMalloc((void**)&dev_particleGridIndices, sizeof(int) * N);
+ checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+ cudaMalloc((void**)&dev_gridCellStartIndices, sizeof(int) * gridCellCount);
+ checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+ cudaMalloc((void**)&dev_gridCellEndIndices, sizeof(int) * gridCellCount);
+ checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+ // wrap raw pointer
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
+ // fill step data
+ cudaMalloc((void**)&dev_dx, sizeof(dxData));
+ checkCUDAErrorWithLine("cudaMalloc dev_dx failed!");
+
+ cudaMalloc((void**)&dev_dy, sizeof(dyData));
+ checkCUDAErrorWithLine("cudaMalloc dev_dy failed!");
+
+ cudaMalloc((void**)&dev_dz, sizeof(dzData));
+ checkCUDAErrorWithLine("cudaMalloc dev_dz failed!");
+
+  thrust::device_ptr<int> dev_thrust_dx(dev_dx);
+  thrust::device_ptr<int> dev_thrust_dy(dev_dy);
+  thrust::device_ptr<int> dev_thrust_dz(dev_dz);
+
+ thrust::copy(dxData, dxData + (sizeof(dxData) >> 2), dev_thrust_dx);
+ thrust::copy(dyData, dyData + (sizeof(dyData) >> 2), dev_thrust_dy);
+ thrust::copy(dzData, dzData + (sizeof(dzData) >> 2), dev_thrust_dz);
+
+ cudaMalloc((void**)&dev_pos_copy, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_pos_copy failed!");
+
+ cudaMalloc((void**)&dev_vel1_copy, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_vel1_copy failed!");
+
cudaThreadSynchronize();
}
@@ -190,7 +250,7 @@ __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float
vbo[4 * index + 1] = pos[index].y * c_scale;
vbo[4 * index + 2] = pos[index].z * c_scale;
vbo[4 * index + 3] = 1.0f;
- }
+ }
}
__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
@@ -201,7 +261,7 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float
vbo[4 * index + 1] = vel[index].y + 0.3f;
vbo[4 * index + 2] = vel[index].z + 0.3f;
vbo[4 * index + 3] = 1.0f;
- }
+ }
}
/**
@@ -229,11 +289,78 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
* Compute the new velocity on the body with index `iSelf` due to the `N` boids
* in the `pos` and `vel` arrays.
*/
-__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
- // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
- // Rule 2: boids try to stay a distance d away from each other
- // Rule 3: boids try to match the speed of surrounding boids
- return glm::vec3(0.0f, 0.0f, 0.0f);
+__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos,
+ const glm::vec3 *vel) {
+
+ glm::vec3 vDelta = {0.f, 0.f, 0.f};
+
+#ifndef RULE1
+ // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+ glm::vec3 centerOfMass = {0.f, 0.f, 0.f};
+ int numOfNeighbors1 = 0;
+
+ for (int i = 0; i < N; ++i) {
+ // self boid, skip
+ if (iSelf == i) {
+ continue;
+ }
+ // not neighbor, skip
+ if (glm::length(pos[iSelf] - pos[i]) > rule1Distance) {
+ continue;
+ }
+
+ centerOfMass += pos[i];
+ ++numOfNeighbors1;
+ }
+
+ if (numOfNeighbors1 > 0) {
+ centerOfMass /= numOfNeighbors1 + 0.f;
+ vDelta += (centerOfMass - pos[iSelf]) * rule1Scale;
+ }
+#endif
+
+#ifndef RULE2
+ // Rule 2: boids try to stay a distance d away from each other
+ for (int i = 0; i < N; ++i) {
+ // self boid, skip
+ if (iSelf == i) {
+ continue;
+ }
+ // not neighbor, skip
+ if (glm::length(pos[iSelf] - pos[i]) > rule2Distance) {
+ continue;
+ }
+
+ vDelta += (pos[iSelf] - pos[i]) * rule2Scale;
+ }
+#endif
+
+#ifndef RULE3
+ // Rule 3: boids try to match the speed of surrounding boids
+ glm::vec3 vAverage = {0.f, 0.f, 0.f};
+  int numOfNeighbors3 = 0;
+
+ for (int i = 0; i < N; ++i) {
+ // self boid, skip
+ if (iSelf == i) {
+ continue;
+ }
+ // not neighbor, skip
+ if (glm::length(pos[iSelf] - pos[i]) > rule3Distance) {
+ continue;
+ }
+
+ vAverage += vel[i];
+    ++numOfNeighbors3;
+ }
+
+  if (numOfNeighbors3 > 0) {
+    vAverage /= numOfNeighbors3 + 0.f;
+ vDelta += vAverage * rule3Scale;
+ }
+#endif
+
+ return vDelta;
}
/**
@@ -242,9 +369,25 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
*/
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
glm::vec3 *vel1, glm::vec3 *vel2) {
+
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // index out of bound
+ if (index >= N) {
+ return;
+ }
+
// Compute a new velocity based on pos and vel1
+ glm::vec3 vDelta = computeVelocityChange(N, index, pos, vel1);
+ glm::vec3 vUpdated = vel1[index] + vDelta;
+
// Clamp the speed
+ if (glm::length(vUpdated) > maxSpeed) {
+ vUpdated = glm::normalize(vUpdated) * maxSpeed;
+ }
+
// Record the new velocity into vel2. Question: why NOT vel1?
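+  // Writing into vel2 keeps vel1 unchanged while other threads may still be
+  // reading it; the host swaps (ping-pongs) the two buffers afterward.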
+ vel2[index] = vUpdated;
}
/**
@@ -256,7 +399,7 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= N) {
return;
- }
+ }
glm::vec3 thisPos = pos[index];
thisPos += vel[index] * dt;
@@ -282,13 +425,37 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
return x + y * gridResolution + z * gridResolution * gridResolution;
}
+__device__ void computeGridIndex3D(const glm::vec3 &pos, const glm::vec3 &min,
+ const float inverseCellWidth, int &indexX, int &indexY, int &indexZ) {
+ glm::vec3 diff = pos - min;
+ glm::vec3 indexDiff = diff * inverseCellWidth;
+  indexX = static_cast<int>(indexDiff.x);
+  indexY = static_cast<int>(indexDiff.y);
+  indexZ = static_cast<int>(indexDiff.z);
+}
+
__global__ void kernComputeIndices(int N, int gridResolution,
glm::vec3 gridMin, float inverseCellWidth,
glm::vec3 *pos, int *indices, int *gridIndices) {
// TODO-2.1
// - Label each boid with the index of its grid cell.
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+ if (index >= N) {
+ return;
+ }
+
+ glm::vec3 diff = pos[index] - gridMin;
+ int indexX = (int)(diff.x * inverseCellWidth);
+ int indexY = (int)(diff.y * inverseCellWidth);
+ int indexZ = (int)(diff.z * inverseCellWidth);
+ int gridIndex = gridIndex3Dto1D(indexX, indexY, indexZ, gridResolution);
+
+ gridIndices[index] = gridIndex;
+
// - Set up a parallel array of integer indices as pointers to the actual
// boid data in pos and vel1/vel2
+ indices[index] = index;
}
// LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -297,7 +464,7 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < N) {
intBuffer[index] = value;
- }
+ }
}
__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
@@ -306,14 +473,47 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
// Identify the start point of each cell in the gridIndices array.
// This is basically a parallel unrolling of a loop that goes
// "this index doesn't match the one before it, must be a new cell!"
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index >= N) {
+ // index out of bound
+ return;
+  } else if (index == 0) {
+    // set start of first cell; when N == 1 it is also the end of that cell
+    gridCellStartIndices[particleGridIndices[index]] = index;
+    if (index == N - 1) {
+      gridCellEndIndices[particleGridIndices[index]] = index;
+    }
+    return;
+ } else if (index == N - 1) {
+ // set end of last cell
+ gridCellEndIndices[particleGridIndices[index]] = index;
+ }
+
+ if (particleGridIndices[index] != particleGridIndices[index - 1]) {
+ // set start of current cell and end of previous cell
+ gridCellEndIndices[particleGridIndices[index - 1]] = index - 1;
+ gridCellStartIndices[particleGridIndices[index]] = index;
+ }
+}
+
+__global__ void kernRearrangeBoidData(int N, int *indices, glm::vec3 *pos,
+ glm::vec3 *pos_copy, glm::vec3 *vel1, glm::vec3 *vel1_copy) {
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index >= N) {
+ return;
+ }
+
+ pos[index] = pos_copy[indices[index]];
+ vel1[index] = vel1_copy[indices[index]];
}
__global__ void kernUpdateVelNeighborSearchScattered(
int N, int gridResolution, glm::vec3 gridMin,
float inverseCellWidth, float cellWidth,
int *gridCellStartIndices, int *gridCellEndIndices,
- int *particleArrayIndices,
- glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+ int *particleGridIndices, int *particleArrayIndices,
+ int *dx, int *dy, int *dz, glm::vec3 *pos, glm::vec3 *vel1,
+ glm::vec3 *vel2) {
// TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
// the number of boids that need to be checked.
// - Identify the grid cell that this particle is in
@@ -322,13 +522,122 @@ __global__ void kernUpdateVelNeighborSearchScattered(
// - Access each boid in the cell and compute velocity change from
// the boids rules, if this boid is within the neighborhood distance.
// - Clamp the speed change before putting the new speed in vel2
+
+ // handle each boid, in the order of cell index
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index >= N) {
+ // index out of bound
+ return;
+ }
+
+ int cellIndexX;
+ int cellIndexY;
+ int cellIndexZ;
+
+ computeGridIndex3D(pos[index], gridMin, inverseCellWidth, cellIndexX,
+ cellIndexY, cellIndexZ);
+
+ glm::vec3 posMid = gridMin + glm::vec3(cellIndexX + .5f, cellIndexY + .5f,
+ cellIndexZ + .5f) * cellWidth;
+ int stepSchemeX = pos[index].x < posMid.x ? 0 : 8;
+ int stepSchemeY = pos[index].y < posMid.y ? 0 : 8;
+ int stepSchemeZ = pos[index].z < posMid.z ? 0 : 8;
+
+ // handle neighbors in the eight neighboring cells
+ glm::vec3 vDelta = {0.f, 0.f, 0.f};
+ glm::vec3 centerOfMass = {0.f, 0.f, 0.f};
+ glm::vec3 averageVelocity = {0.f, 0.f, 0.f};
+ int numNeighbors1 = 0;
+ int numNeighbors3 = 0;
+
+ for (int j = 0; j < 8; ++j) {
+
+ int newCellIndexX = cellIndexX + dx[stepSchemeX + j];
+ int newCellIndexY = cellIndexY + dy[stepSchemeY + j];
+ int newCellIndexZ = cellIndexZ + dz[stepSchemeZ + j];
+
+ if (newCellIndexX < 0 || newCellIndexX >= gridResolution
+ || newCellIndexY < 0 || newCellIndexY >= gridResolution
+ || newCellIndexZ < 0 || newCellIndexZ >= gridResolution) {
+ continue;
+ }
+
+ int newCellIndex = gridIndex3Dto1D(newCellIndexX,
+ newCellIndexY, newCellIndexZ, gridResolution);
+ int startIndex = gridCellStartIndices[newCellIndex];
+ int endIndex = gridCellEndIndices[newCellIndex];
+
+#ifndef RULE1
+ // gather to center of mass
+ for (int k = startIndex; k != -1 && k <= endIndex; ++k) {
+
+ int neighborIndex = particleArrayIndices[k];
+
+ if (index == neighborIndex) {
+ continue;
+ }
+ if (glm::length(pos[index] - pos[neighborIndex]) > rule1Distance) {
+ continue;
+ }
+ centerOfMass += pos[neighborIndex];
+ ++numNeighbors1;
+ }
+#endif
+#ifndef RULE2
+ // stay separated
+ for (int k = startIndex; k != -1 && k <= endIndex; ++k) {
+
+ int neighborIndex = particleArrayIndices[k];
+
+ if (index == neighborIndex) {
+ continue;
+ }
+ if (glm::length(pos[index] - pos[neighborIndex]) > rule2Distance) {
+ continue;
+ }
+ vDelta += (pos[index] - pos[neighborIndex]) * rule2Scale;
+ }
+#endif
+#ifndef RULE3
+ // follow neighbor velocity
+ for (int k = startIndex; k != -1 && k <= endIndex; ++k) {
+
+ int neighborIndex = particleArrayIndices[k];
+
+ if (index == neighborIndex) {
+ continue;
+ }
+ if (glm::length(pos[index] - pos[neighborIndex]) > rule3Distance) {
+ continue;
+ }
+ averageVelocity += vel1[neighborIndex];
+ ++numNeighbors3;
+ }
+#endif
+ }
+
+ if (numNeighbors1 > 0) {
+ centerOfMass /= numNeighbors1 + 0.f;
+ vDelta += (centerOfMass - pos[index]) * rule1Scale;
+ }
+ if (numNeighbors3 > 0) {
+ averageVelocity /= numNeighbors3 + 0.f;
+ vDelta += averageVelocity * rule3Scale;
+ }
+
+ glm::vec3 vUpdated = vel1[index] + vDelta;
+
+ if (glm::length(vUpdated) > maxSpeed) {
+ vUpdated = glm::normalize(vUpdated) * maxSpeed;
+ }
+ vel2[index] = vUpdated;
}
__global__ void kernUpdateVelNeighborSearchCoherent(
- int N, int gridResolution, glm::vec3 gridMin,
- float inverseCellWidth, float cellWidth,
- int *gridCellStartIndices, int *gridCellEndIndices,
- glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+ int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, float cellWidth,
+ int *gridCellStartIndices, int *gridCellEndIndices, int *particleGridIndices,
+ int *dx, int *dy, int *dz, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
// TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
// except with one less level of indirection.
// This should expect gridCellStartIndices and gridCellEndIndices to refer
@@ -341,6 +650,107 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
// - Access each boid in the cell and compute velocity change from
// the boids rules, if this boid is within the neighborhood distance.
// - Clamp the speed change before putting the new speed in vel2
+
+ // handle each boid, in the order of cell index
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+ if (index >= N) {
+ // index out of bound
+ return;
+ }
+
+ int cellIndexX;
+ int cellIndexY;
+ int cellIndexZ;
+
+ computeGridIndex3D(pos[index], gridMin, inverseCellWidth, cellIndexX,
+ cellIndexY, cellIndexZ);
+
+ glm::vec3 posMid = gridMin + glm::vec3(cellIndexX + .5f, cellIndexY + .5f,
+ cellIndexZ + .5f) * cellWidth;
+ int stepSchemeX = pos[index].x < posMid.x ? 0 : 8;
+ int stepSchemeY = pos[index].y < posMid.y ? 0 : 8;
+ int stepSchemeZ = pos[index].z < posMid.z ? 0 : 8;
+
+ // handle neighbors in the eight neighboring cells
+ glm::vec3 vDelta = {0.f, 0.f, 0.f};
+ glm::vec3 centerOfMass = {0.f, 0.f, 0.f};
+ glm::vec3 averageVelocity = {0.f, 0.f, 0.f};
+ int numNeighbors1 = 0;
+ int numNeighbors3 = 0;
+
+ for (int j = 0; j < 8; ++j) {
+
+ int newCellIndexX = cellIndexX + dx[stepSchemeX + j];
+ int newCellIndexY = cellIndexY + dy[stepSchemeY + j];
+ int newCellIndexZ = cellIndexZ + dz[stepSchemeZ + j];
+
+ if (newCellIndexX < 0 || newCellIndexX >= gridResolution
+ || newCellIndexY < 0 || newCellIndexY >= gridResolution
+ || newCellIndexZ < 0 || newCellIndexZ >= gridResolution) {
+ continue;
+ }
+
+ int newCellIndex = gridIndex3Dto1D(newCellIndexX,
+ newCellIndexY, newCellIndexZ, gridResolution);
+ int startIndex = gridCellStartIndices[newCellIndex];
+ int endIndex = gridCellEndIndices[newCellIndex];
+
+#ifndef RULE1
+ // gather to center of mass
+ for (int k = startIndex; k != -1 && k <= endIndex; ++k) {
+ if (index == k) {
+ continue;
+ }
+ if (glm::length(pos[index] - pos[k]) > rule1Distance) {
+ continue;
+ }
+ centerOfMass += pos[k];
+ ++numNeighbors1;
+ }
+#endif
+#ifndef RULE2
+ // stay separated
+ for (int k = startIndex; k != -1 && k <= endIndex; ++k) {
+ if (index == k) {
+ continue;
+ }
+ if (glm::length(pos[index] - pos[k]) > rule2Distance) {
+ continue;
+ }
+ vDelta += (pos[index] - pos[k]) * rule2Scale;
+ }
+#endif
+#ifndef RULE3
+ // follow neighbor velocity
+ for (int k = startIndex; k != -1 && k <= endIndex; ++k) {
+ if (index == k) {
+ continue;
+ }
+ if (glm::length(pos[index] - pos[k]) > rule3Distance) {
+ continue;
+ }
+ averageVelocity += vel1[k];
+ ++numNeighbors3;
+ }
+#endif
+ }
+
+ if (numNeighbors1 > 0) {
+ centerOfMass /= numNeighbors1 + 0.f;
+ vDelta += (centerOfMass - pos[index]) * rule1Scale;
+ }
+ if (numNeighbors3 > 0) {
+ averageVelocity /= numNeighbors3 + 0.f;
+ vDelta += averageVelocity * rule3Scale;
+ }
+
+ glm::vec3 vUpdated = vel1[index] + vDelta;
+
+ if (glm::length(vUpdated) > maxSpeed) {
+ vUpdated = glm::normalize(vUpdated) * maxSpeed;
+ }
+ vel2[index] = vUpdated;
}
/**
@@ -348,7 +758,15 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
*/
void Boids::stepSimulationNaive(float dt) {
// TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
// TODO-1.2 ping-pong the velocity buffers
+ auto tmp = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tmp;
}
void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +782,36 @@ void Boids::stepSimulationScatteredGrid(float dt) {
// - Perform velocity updates using neighbor search
// - Update positions
// - Ping-pong buffers as needed
+
+ // sort arrays
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ gridSideCount, gridMinimum, gridInverseCellWidth,
+ dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+ thrust::sort_by_key(dev_thrust_particleGridIndices,
+ dev_thrust_particleGridIndices + numObjects,
+ dev_thrust_particleArrayIndices);
+
+ cudaMemset(dev_gridCellStartIndices, -1, sizeof(int) * gridCellCount);
+ cudaMemset(dev_gridCellEndIndices, -1, sizeof(int) * gridCellCount);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(
+ numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+ gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+ dev_particleGridIndices, dev_particleArrayIndices, dev_dx, dev_dy, dev_dz,
+ dev_pos, dev_vel1, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+ // ping-pong buffers
+ auto tmp = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tmp;
}
void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +830,46 @@ void Boids::stepSimulationCoherentGrid(float dt) {
// - Perform velocity updates using neighbor search
// - Update positions
// - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+ // sort arrays
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ gridSideCount, gridMinimum, gridInverseCellWidth,
+ dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+ thrust::sort_by_key(dev_thrust_particleGridIndices,
+ dev_thrust_particleGridIndices + numObjects,
+ dev_thrust_particleArrayIndices);
+
+ // identify grid start/end index
+ cudaMemset(dev_gridCellStartIndices, -1, sizeof(int) * gridCellCount);
+ cudaMemset(dev_gridCellEndIndices, -1, sizeof(int) * gridCellCount);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+ // rearrange boid data order
+ cudaMemcpy((void*)dev_pos_copy, (void*)dev_pos, sizeof(glm::vec3) * numObjects,
+ cudaMemcpyDeviceToDevice);
+ cudaMemcpy((void*)dev_vel1_copy, (void*)dev_vel1, sizeof(glm::vec3) * numObjects,
+ cudaMemcpyDeviceToDevice);
+
+  kernRearrangeBoidData<<<fullBlocksPerGrid, blockSize>>>(numObjects,
+ dev_particleArrayIndices, dev_pos, dev_pos_copy, dev_vel1, dev_vel1_copy);
+
+ // update boid velocity
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(
+ numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+ dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleGridIndices,
+ dev_dx, dev_dy, dev_dz, dev_pos, dev_vel1, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+ // ping-pong buffers
+ auto tmp = dev_vel1;
+ dev_vel1 = dev_vel2;
+ dev_vel2 = tmp;
}
void Boids::endSimulation() {
@@ -390,6 +878,15 @@ void Boids::endSimulation() {
cudaFree(dev_pos);
// TODO-2.1 TODO-2.3 - Free any additional buffers here.
+ cudaFree(dev_particleArrayIndices);
+ cudaFree(dev_particleGridIndices);
+ cudaFree(dev_gridCellStartIndices);
+ cudaFree(dev_gridCellEndIndices);
+ cudaFree(dev_pos_copy);
+ cudaFree(dev_vel1_copy);
+ cudaFree(dev_dx);
+ cudaFree(dev_dy);
+ cudaFree(dev_dz);
}
void Boids::unitTest() {
@@ -426,7 +923,7 @@ void Boids::unitTest() {
for (int i = 0; i < N; i++) {
std::cout << " key: " << intKeys[i];
std::cout << " value: " << intValues[i] << std::endl;
- }
+ }
// How to copy data to the GPU
cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
@@ -435,6 +932,7 @@ void Boids::unitTest() {
// Wrap device vectors in thrust iterators for use with thrust.
thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+
// LOOK-2.1 Example for using thrust::sort_by_key
thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
@@ -447,11 +945,11 @@ void Boids::unitTest() {
for (int i = 0; i < N; i++) {
std::cout << " key: " << intKeys[i];
std::cout << " value: " << intValues[i] << std::endl;
- }
+ }
// cleanup
- delete(intKeys);
- delete(intValues);
+ delete [] intKeys;
+ delete [] intValues;
cudaFree(dev_intKeys);
cudaFree(dev_intValues);
checkCUDAErrorWithLine("cudaFree failed!");
diff --git a/src/main.cpp b/src/main.cpp
index e416836..ab50f72 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
// LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
// LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 100000;
const float DT = 0.2f;
/**