diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76a65a0..c6816e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,7 +67,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
     set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 endif()
 
-#add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
+add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
 add_subdirectory(src)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -77,7 +77,7 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
 
 target_link_libraries(${CMAKE_PROJECT_NAME}
     src
-    #stream_compaction  # TODO: uncomment if using your stream compaction
+    stream_compaction  # TODO: uncomment if using your stream compaction
     ${CORELIBS}
     )
 
diff --git a/README.md b/README.md
index 374266e..7246c6b 100644
--- a/README.md
+++ b/README.md
@@ -3,303 +3,48 @@ CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Levi Cai
+* Tested on: Windows 8, i7-5500U @ 2.4GHz 12GB RAM, NVIDIA GeForce GTX 940M 2GB
 
-### (TODO: Your README)
+This is a path tracer implemented in CUDA that supports basic ray casting, diffuse and specular surfaces, sphere and cube intersections, work efficient-stream compaction with shared memory, motion blurring, refraction, and depth-of-field.
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+### Examples of Features Implemented
 
-Instructions (delete me)
-========================
+## Specular vs. Diffuse Surfaces
 
-This is due Thursday, September 24 evening at midnight.
+![](img/cornell_specular.png)
 
-**Summary:**
-In this project, you'll implement a CUDA-based path tracer capable of rendering
-globally-illuminated images very quickly.
-Since in this class we are concerned with working in GPU programming,
-performance, and the generation of actual beautiful images (and not with
-mundane programming tasks like I/O), this project includes base code for
-loading a scene description file, described below, and various other things
-that generally make up a framework for previewing and saving images.
+Here I demonstrate the difference in specular vs. diffuse surfaces. The colors are simply determined by intensities associated with the materials, the direction that the rays are then scattered are determined by the contribution of specular color vs. diffuse color of the object probabilistically. If the a particular ray is determined to be specular, we simply compute the surface normal of the reflection and the new ray direction is v = v - 2 * dot(v,n) * n. If it is diffuse, then we compute the new direction according to a randomly sampled hemisphere around the surface normal with cosine weighting.
 
-The core renderer is left for you to implement. Finally, note that, while this
-base code is meant to serve as a strong starting point for a CUDA path tracer,
-you are not required to use it if you don't want to. You may also change any
-part of the base code as you please. **This is YOUR project.**
+## Motion Blurring
 
-**Recommendation:** Every image you save should automatically get a different
-filename. Don't delete all of them! For the benefit of your README, keep a
-bunch of them around so you can pick a few to document your progress at the
-end.
+![](img/cornell_bounce.png)
 
-### Contents
+Motion blurring is achieved by specifying a start and goal position for objects in the scene that are moving. Then before each iteration of path tracing, we randomly and uniformly adjust the position of the object along that vector, then proceed the iteration as before. This allows us to get a fairly nice motion blur that still works regardless of the number of iterations requested. An alternative is to attempt to move the object iteration by iteration (and not randomly), but it is difficult to tell how much it should move each iteration because they do not necessarily correspond to reasonable time intervals.
 
-* `src/` C++/CUDA source files.
-* `scenes/` Example scene description files.
-* `img/` Renders of example scene description files.
-  (These probably won't match precisely with yours.)
-* `external/` Includes and static libraries for 3rd party libraries.
+## Refraction (Glass)
 
+![](img/cornell_glass.png)
 
-### Running the code
+Refraction is created by using Schlick's approximation in order to determine the amount of rays that are specular reflections and the rest are refractive. Refractions are then computed according to glm::refract.
 
-The main function requires a scene description file. Call the program with
-one as an argument: `cis565_path_tracer scenes/sphere.txt`.
-(In Visual Studio, `../scenes/sphere.txt`.)
+## Depth of Field
 
-If you are using Visual Studio, you can set this in the Debugging > Command
-Arguments section in the Project properties. Make sure you get the path right -
-read the console for errors.
+![](img/cornell_dof.png)
 
-#### Controls
+The depth of field feature is accomplished by taking the initial rays that are projected from the origin, finding their intersection to a plane located at the focal length away from the camera, and then finding new ray directions by adjusting the camera origin randomly each iteration according to the aperture size.
 
-* Esc to save an image and exit.
-* Space to save an image. Watch the console for the output filename.
-* W/A/S/D and R/F move the camera. Arrow keys rotate.
+### Analysis of Work-Efficient Shared Memory Stream Compaction
 
-## Requirements
+I implemented the work-efficient stream compaction with shared memory scan function as described by http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
 
-**Ask on the mailing list for clarifications.**
+This allows for significant performance gains in OPEN environments where rays are quickly terminated (or in environments with an extreme number of lights). We can see some of the effects in the graphs below:
 
-In this project, you are given code for:
+![](img/cornell_alive_vs_depth.png)
 
-* Loading and reading the scene description format
-* Sphere and box intersection functions
-* Support for saving images
-* Working CUDA-GL interop for previewing your render while it's running
-* A function which generates random screen noise (instead of an actual render).
+(Above) I compared the depth in the original cornell image with the number of still-living rays. There is a logarithmic drop-off in the scene of number of still living rays that must be tracked, but for the first several depths the gains are quite dramatic as rays leave the open side of the box quickly.
 
-You will need to implement the following features:
+![](img/cornell_initial_vs_time.png)
 
-* Raycasting from the camera into the scene through an imaginary grid of pixels
-  (the screen)
-  * Implement antialiasing (by jittering rays within each pixel)
-* Diffuse surfaces
-* Perfectly specular-reflective (mirrored) surfaces
-  * See notes on diffuse/specular in `scatterRay` and on specular below
-* Stream compaction optimization. You may use any of:
-  * Your global-memory work-efficient stream compaction implementation.
-  * A shared-memory work-efficient stream compaction (see below).
-  * `thrust::remove_if` or any of the other Thrust stream compaction functions.
+(Above) Here we wish to see the effect of very open environments vs. the runtime. To see this, I simply moved the camera's starting position back in the original Cornell image setup (so many rays immediately terminate) and we can see that the performance increases linearly with the number of rays that immediately terminate. We can thus imagine a completely enclosed scene to gain nothing from stream compaction (if anything, it hurts a small amount to do the extra computation due to the cudaMallocs and such, though many of those could possibly be avoided).
 
-You are also required to implement at least 2 of the following features.
-Please ask if you need good references (they will be added to this README
-later on). If you find good references, share them! **Extra credit**: implement
-more features on top of the 2 required ones, with point value up to +20/100 at
-the grader's discretion (based on difficulty and coolness).
-
-* Work-efficient stream compaction using shared memory across multiple blocks
-  (See *GPU Gems 3* Chapter 39).
-* These 2 smaller features:
-  * Refraction (e.g. glass/water) with Frensel effects using Schlick's
-    approximation or more accurate methods
-  * Physically-based depth-of-field (by jittering rays within an aperture)
-  * Recommended but not required: non-perfect specular surfaces
-* Texture mapping
-* Bump mapping
-* Direct lighting (by taking a final ray directly to a random point on an
-  emissive object acting as a light source)
-* Some method of defining object motion, and motion blur
-* Subsurface scattering
-* Arbitrary mesh loading and rendering (e.g. `obj` files). You can find these
-  online or export them from your favorite 3D modeling application.
-  With approval, you may use a third-party OBJ loading code to bring the data
-  into C++.
-  * You can use the triangle intersection function `glm::intersectRayTriangle`.
-
-This 'extra features' list is not comprehensive. If you have a particular idea
-you would like to implement (e.g. acceleration structures, etc.), please
-contact us first.
-
-For each extra feature, you must provide the following analysis:
-
-* Overview write-up of the feature
-* Performance impact of the feature
-* If you did something to accelerate the feature, what did you do and why?
-* Compare your GPU version of the feature to a HYPOTHETICAL CPU version
-  (you don't have to implement it!) Does it benefit or suffer from being
-  implemented on the GPU?
-* How might this feature be optimized beyond your current implementation?
-
-## Base Code Tour
-
-You'll be working in the following files. Look for important parts of the code:
-search for `CHECKITOUT`. You'll have to implement parts labeled with `TODO`.
-(But don't let these constrain you - you have free rein!)
-
-* `src/pathtrace.cu`: path tracing kernels, device functions, and calling code
-  * `pathtraceInit` initializes the path tracer state - it should copy
-    scene data (e.g. geometry, materials) from `Scene`.
-  * `pathtraceFree` frees memory allocated by `pathtraceInit`
-  * `pathtrace` performs one iteration of the rendering - it handles kernel
-    launches, memory copies, transferring some data, etc.
-    * See comments for a low-level path tracing recap.
-* `src/intersections.h`: ray intersection functions
-  * `boxIntersectionTest` and `sphereIntersectionTest`, which take in a ray and
-    a geometry object and return various properties of the intersection.
-* `src/interactions.h`: ray scattering functions
-  * `calculateRandomDirectionInHemisphere`: a cosine-weighted random direction
-    in a hemisphere. Needed for implementing diffuse surfaces.
-  * `scatterRay`: this function should perform all ray scattering, and will
-    call `calculateRandomDirectionInHemisphere`. See comments for details.
-* `src/main.cpp`: you don't need to do anything here, but you can change the
-  program to save `.hdr` image files, if you want (for postprocessing).
-
-### Generating random numbers
-
-```
-thrust::default_random_engine rng(hash(index));
-thrust::uniform_real_distribution<float> u01(0, 1);
-float result = u01(rng);
-```
-
-There is a convenience function for generating a random engine using a
-combination of index, iteration, and depth as the seed:
-
-```
-thrust::default_random_engine rng = random_engine(iter, index, depth);
-```
-
-### Specular Lighting
-
-In path tracing, like diffuse materials, specular materials are
-simulated using a probability distribution instead computing the
-strength of a ray bounce based on angles.
-
-Equations 7, 8, and 9 of
-[GPU Gems 3 chapter 20](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html)
-give the formulas for generating a random specular ray. (Note that
-there is a typographical error: χ in the text = ξ in the formulas.)
-
-Also see the notes in `scatterRay` for probability splits between
-diffuse/specular/other material types.
-
-### Handling Long-Running CUDA Threads
-
-By default, your GPU driver will probably kill a CUDA kernel if it runs for more than 5 seconds. There's a way to disable this timeout. Just beware of infinite loops - they may lock up your computer.
-
-> The easiest way to disable TDR for Cuda programming, assuming you have the NVIDIA Nsight tools installed, is to open the Nsight Monitor, click on "Nsight Monitor options", and under "General" set "WDDM TDR enabled" to false. This will change the registry setting for you. Close and reboot. Any change to the TDR registry setting won't take effect until you reboot. [Stack Overflow](http://stackoverflow.com/questions/497685/cuda-apps-time-out-fail-after-several-seconds-how-to-work-around-this)
-
-### Notes on GLM
-
-This project uses GLM for linear algebra.
-
-On NVIDIA cards pre-Fermi (pre-DX12), you may have issues with mat4-vec4
-multiplication. If you have one of these cards, be careful! If you have issues,
-you might need to grab `cudamat4` and `multiplyMV` from the
-[Fall 2014 project](https://github.com/CIS565-Fall-2014/Project3-Pathtracer).
-Let us know if you need to do this.
-
-### Scene File Format
-
-This project uses a custom scene description format. Scene files are flat text
-files that describe all geometry, materials, lights, cameras, and render
-settings inside of the scene. Items in the format are delimited by new lines,
-and comments can be added using C-style `// comments`.
-
-Materials are defined in the following fashion:
-
-* MATERIAL (material ID) //material header
-* RGB (float r) (float g) (float b) //diffuse color
-* SPECX (float specx) //specular exponent
-* SPECRGB (float r) (float g) (float b) //specular color
-* REFL (bool refl) //reflectivity flag, 0 for no, 1 for yes
-* REFR (bool refr) //refractivity flag, 0 for no, 1 for yes
-* REFRIOR (float ior) //index of refraction for Fresnel effects
-* EMITTANCE (float emittance) //the emittance strength of the material. Material is a light source iff emittance > 0.
-
-Cameras are defined in the following fashion:
-
-* CAMERA //camera header
-* RES (float x) (float y) //resolution
-* FOVY (float fovy) //vertical field of view half-angle. the horizonal angle is calculated from this and the reslution
-* ITERATIONS (float interations) //how many iterations to refine the image,
-  only relevant for supersampled antialiasing, depth of field, area lights, and
-  other distributed raytracing applications
-* DEPTH (int depth) //maximum depth (number of times the path will bounce)
-* FILE (string filename) //file to output render to upon completion
-* EYE (float x) (float y) (float z) //camera's position in worldspace
-* VIEW (float x) (float y) (float z) //camera's view direction
-* UP (float x) (float y) (float z) //camera's up vector
-
-Objects are defined in the following fashion:
-
-* OBJECT (object ID) //object header
-* (cube OR sphere OR mesh) //type of object, can be either "cube", "sphere", or
-  "mesh". Note that cubes and spheres are unit sized and centered at the
-  origin.
-* material (material ID) //material to assign this object
-* TRANS (float transx) (float transy) (float transz) //translation
-* ROTAT (float rotationx) (float rotationy) (float rotationz) //rotation
-* SCALE (float scalex) (float scaley) (float scalez) //scale
-
-Two examples are provided in the `scenes/` directory: a single emissive sphere,
-and a simple cornell box made using cubes for walls and lights and a sphere in
-the middle.
-
-## Third-Party Code Policy
-
-* Use of any third-party code must be approved by asking on our Google Group.
-* If it is approved, all students are welcome to use it. Generally, we approve
-  use of third-party code that is not a core part of the project. For example,
-  for the path tracer, we would approve using a third-party library for loading
-  models, but would not approve copying and pasting a CUDA function for doing
-  refraction.
-* Third-party code **MUST** be credited in README.md.
-* Using third-party code without its approval, including using another
-  student's code, is an academic integrity violation, and will, at minimum,
-  result in you receiving an F for the semester.
-
-## README
-
-Please see: [**TIPS FOR WRITING AN AWESOME README**](https://github.com/pjcozzi/Articles/blob/master/CIS565/GitHubRepo/README.md)
-
-* Sell your project.
-* Assume the reader has a little knowledge of path tracing - don't go into
-  detail explaining what it is. Focus on your project.
-* Don't talk about it like it's an assignment - don't say what is and isn't
-  "extra" or "extra credit." Talk about what you accomplished.
-* Use this to document what you've done.
-* *DO NOT* leave the README to the last minute! It is a crucial part of the
-  project, and we will not be able to grade you without a good README.
-
-In addition:
-
-* This is a renderer, so include images that you've made!
-* Be sure to back your claims for optimization with numbers and comparisons.
-* If you reference any other material, please provide a link to it.
-* You wil not be graded on how fast your path tracer runs, but getting close to
-  real-time is always nice!
-* If you have a fast GPU renderer, it is very good to show case this with a
-  video to show interactivity. If you do so, please include a link!
-
-### Analysis
-
-* Stream compaction helps most after a few bounces. Print and plot the
-  effects of stream compaction within a single iteration (i.e. the number of
-  unterminated rays after each bounce) and evaluate the benefits you get from
-  stream compaction.
-* Compare scenes which are open (like the given cornell box) and closed
-  (i.e. no light can escape the scene). Again, compare the performance effects
-  of stream compaction! Remember, stream compaction only affects rays which
-  terminate, so what might you expect?
-
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project N: PENNKEY`.
-   * Direct link to your pull request on GitHub.
-   * Estimate the amount of time you spent on the project.
-   * If there were any outstanding problems, or if you did any extra
-     work, *briefly* explain.
-   * Feedback on the project itself, if any.
diff --git a/img/cornell.2015-09-30_00-35-22z.1773samp.png b/img/cornell.2015-09-30_00-35-22z.1773samp.png
new file mode 100644
index 0000000..1853ccf
Binary files /dev/null and b/img/cornell.2015-09-30_00-35-22z.1773samp.png differ
diff --git a/img/cornell_alive_vs_depth.png b/img/cornell_alive_vs_depth.png
new file mode 100644
index 0000000..6d7f9b9
Binary files /dev/null and b/img/cornell_alive_vs_depth.png differ
diff --git a/img/cornell_bounce.png b/img/cornell_bounce.png
new file mode 100644
index 0000000..8a1ff47
Binary files /dev/null and b/img/cornell_bounce.png differ
diff --git a/img/cornell_dof.png b/img/cornell_dof.png
new file mode 100644
index 0000000..9b6d205
Binary files /dev/null and b/img/cornell_dof.png differ
diff --git a/img/cornell_glass.png b/img/cornell_glass.png
new file mode 100644
index 0000000..1853ccf
Binary files /dev/null and b/img/cornell_glass.png differ
diff --git a/img/cornell_initial_vs_time.png b/img/cornell_initial_vs_time.png
new file mode 100644
index 0000000..f4e8fe6
Binary files /dev/null and b/img/cornell_initial_vs_time.png differ
diff --git a/img/cornell_specular.png b/img/cornell_specular.png
new file mode 100644
index 0000000..bca56c0
Binary files /dev/null and b/img/cornell_specular.png differ
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index a18639e..cd3195c 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -6,13 +6,13 @@ SPECRGB     0 0 0
 REFL        0
 REFR        0
 REFRIOR     0
-EMITTANCE   5
+EMITTANCE   10
 
 // Diffuse white
 MATERIAL 1
 RGB         .98 .98 .98
 SPECEX      0
-SPECRGB     1 1 1
+SPECRGB     0.01 0.01 0.01
 REFL        0
 REFR        0
 REFRIOR     0
@@ -22,7 +22,7 @@ EMITTANCE   0
 MATERIAL 2
 RGB         .85 .35 .35
 SPECEX      0
-SPECRGB     1 1 1
+SPECRGB     0 0 0
 REFL        0
 REFR        0
 REFRIOR     0
@@ -32,6 +32,16 @@ EMITTANCE   0
 MATERIAL 3
 RGB         .35 .85 .35
 SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Mirror white
+MATERIAL 4
+RGB         .01 .01 .01
+SPECEX      0
 SPECRGB     1 1 1
 REFL        0
 REFR        0
@@ -45,6 +55,8 @@ FOVY        45
 ITERATIONS  5000
 DEPTH       8
 FILE        cornell
+FOCAL       11.5
+APERTURE    0.1
 EYE         0.0 5 10.5
 VIEW        0 0 -1
 UP          0 1 0
@@ -72,7 +84,7 @@ cube
 material 1
 TRANS       0 10 0
 ROTAT       0 0 90
-SCALE       .01 10 10
+SCALE       .1 10 10
 
 // Back wall
 OBJECT 3
@@ -80,7 +92,7 @@ cube
 material 1
 TRANS       0 5 -5
 ROTAT       0 90 0
-SCALE       .01 10 10
+SCALE       .1 10 10
 
 // Left wall
 OBJECT 4
@@ -88,7 +100,7 @@ cube
 material 2
 TRANS       -5 5 0
 ROTAT       0 0 0
-SCALE       .01 10 10
+SCALE       .1 10 10
 
 // Right wall
 OBJECT 5
@@ -96,12 +108,20 @@ cube
 material 3
 TRANS       5 5 0
 ROTAT       0 0 0
-SCALE       .01 10 10
+SCALE       .1 10 10
 
 // Sphere
 OBJECT 6
 sphere
-material 1
-TRANS       -1 4 -1
+material 4
+TRANS       -1 2 -3
 ROTAT       0 0 0
 SCALE       3 3 3
+
+// Sphere
+OBJECT 7
+sphere
+material 1
+TRANS       0 5 1
+ROTAT       0 0 0
+SCALE       2 2 2
diff --git a/scenes/cornell_move.txt b/scenes/cornell_move.txt
new file mode 100644
index 0000000..039d5ce
--- /dev/null
+++ b/scenes/cornell_move.txt
@@ -0,0 +1,136 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   10
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0.01 0.01 0.01
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Mirror white
+MATERIAL 4
+RGB         .01 .01 .01
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+FOCAL       11.5
+APERTURE    0.5
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .1 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .1 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .1 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .1 10 10
+
+// Sphere background
+OBJECT 6
+sphere
+material 1
+TRANS       -1 2 -3
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// Sphere focused
+OBJECT 7
+sphere
+material 3
+TRANS       0 5 1
+ROTAT       0 0 0
+SCALE       2 2 2
+
+// Sphere moving
+OBJECT 8
+sphere
+material 2
+TRANS       3 4 -3
+ROTAT       0 0 0
+SCALE       3 3 3
+MOVETO      3 1 -3
diff --git a/scenes/cornell_refraction.txt b/scenes/cornell_refraction.txt
new file mode 100644
index 0000000..948eb56
--- /dev/null
+++ b/scenes/cornell_refraction.txt
@@ -0,0 +1,137 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   10
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0.01 0.01 0.01
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Mirror white
+MATERIAL 4
+RGB         .01 .01 .01
+SPECEX      0
+SPECRGB     1 1 1
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Glass
+MATERIAL 5
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0.1 0.1 0.1
+REFL        0
+REFR        1
+REFRIOR     1.52
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+FOCAL       11.5
+APERTURE    0.25
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .1 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .1 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .1 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .1 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -1 2 -3
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// Sphere
+OBJECT 7
+sphere
+material 5
+TRANS       -3 3 0
+ROTAT       0 0 0
+SCALE       3 3 3
diff --git a/scenes/sphere.txt b/scenes/sphere.txt
index c70d546..086b0cf 100644
--- a/scenes/sphere.txt
+++ b/scenes/sphere.txt
@@ -17,7 +17,7 @@ DEPTH       8
 FILE        sphere
 EYE         0.0 5 10.5
 VIEW        0 0 -1
-UP          0 1 0
+UP          0 1.2 0
 
 // Sphere
 OBJECT 0
diff --git a/src/interactions.h b/src/interactions.h
index 9386a09..4b744bc 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -9,7 +9,7 @@
  */
 __host__ __device__
 glm::vec3 calculateRandomDirectionInHemisphere(
-        glm::vec3 normal, thrust::default_random_engine &rng) {
+        const glm::vec3 normal, thrust::default_random_engine &rng) {
     thrust::uniform_real_distribution<float> u01(0, 1);
 
     float up = sqrt(u01(rng)); // cos(theta)
@@ -41,6 +41,37 @@ glm::vec3 calculateRandomDirectionInHemisphere(
         + sin(around) * over * perpendicularDirection2;
 }
 
+__device__ float schlick(glm::vec3 inc, glm::vec3 normal, float n1, float n2){
+	// Schlick's reflectance approximation
+	float R0 = (n1 - n2) * (n1 - n2) / ((n1 + n2) * (n1 + n2));
+	float cosi = -glm::dot(normal, inc);
+
+	if (n1 > n2){
+		float n = n1 / n2;
+		float sint2 = n * n * (1.0f - cosi * cosi);
+		if (sint2 > 1.0f) return 1.0f; // TIR
+		cosi = sqrt(1.0f - sint2);
+	}
+
+	float cosii = 1.0f - cosi;
+	float RT = R0 + (1.0f - R0)*cosii*cosii*cosii*cosii*cosii;
+
+	return RT;
+}
+
+__device__ void refract(Ray& ray, glm::vec3 normal, float n1, float n2){
+	float n = n1 / n2;
+	float cosi = -glm::dot(normal, ray.direction);
+	float sint2 = n * n * (1.0f - cosi*cosi);
+	if (sint2 > 1.0f){
+		ray.color = glm::vec3(0.0f);
+		ray.isAlive = false;
+	}
+	float cost = sqrt(1.0f - sint2);
+	ray.direction = n * ray.direction + (n * cosi - cost) * normal;
+	ray.direction = glm::vec3(0.0,1.0,0.0);
+}
+
 /**
  * Scatter a ray with some probabilities according to the material properties.
  * For example, a diffuse surface scatters in a cosine-weighted hemisphere.
@@ -64,15 +95,94 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  *
  * You may need to change the parameter list for your purposes!
  */
-__host__ __device__
+__device__
 void scatterRay(
         Ray &ray,
-        glm::vec3 &color,
-        glm::vec3 intersect,
-        glm::vec3 normal,
+        const glm::vec3 intersect,
+        const glm::vec3 normal,
         const Material &m,
+		const Geom &g,
         thrust::default_random_engine &rng) {
-    // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
-}
+
+	thrust::uniform_real_distribution<float> u01(0, 1);
+	float randNum = u01(rng);
+
+	// Light
+	if (m.emittance > 0.0f){
+		ray.color = ray.color * m.emittance;
+		ray.isAlive = false;
+		return;
+	}
+
+	float specProb = m.specular.color.r + m.specular.color.g + m.specular.color.b;
+	float diffuseProb = m.color.r + m.color.g + m.color.b;
+	float total = specProb + diffuseProb;
+	specProb = specProb / total;
+
+	// From Stanford notes
+	if (m.hasRefractive){
+		float n1 = 1.0f;
+		float n2 = m.indexOfRefraction;
+
+		ray.direction = glm::normalize(ray.direction);
+
+		float RT = schlick(ray.direction, normal, n1, n2);
+		if (RT >= 1.0f){
+			ray.color = glm::vec3(0.0f,1.0f,0.0f);
+			ray.isAlive = false;
+			return;
+		}
+
+		// Specular
+		//printf("%f\n",RT);
+		if (randNum < RT){
+			ray.direction = ray.direction - 2.0f * (glm::dot(ray.direction, normal)) * normal;
+			ray.color = ray.color * m.specular.color;
+			ray.origin = intersect;
+		}
+		else {
+			ray.direction = glm::normalize(ray.direction);
+			ray.direction = glm::refract(ray.direction, normal, n1 / n2);
+
+			ray.origin = intersect + 0.0003f * ray.direction;
+
+			glm::vec3 new_intersect;
+			glm::vec3 new_normal;
+			bool new_outside;
+			bool isIntersection;
+
+			if (g.type == SPHERE){
+				isIntersection = sphereIntersectionTest(g, ray, new_intersect, new_normal, new_outside);
+			}
+			else{
+				isIntersection = boxIntersectionTest(g, ray, new_intersect, new_normal, new_outside);
+			}
+
+			ray.direction = glm::refract(ray.direction, new_normal, n1 / n2);
+			ray.direction = glm::normalize(ray.direction);
+			if (ray.direction == glm::vec3(0.0)){
+				ray.color = glm::vec3(0.0,1.0,0.0);
+				ray.isAlive = false;
+			}
+
+			ray.origin = new_intersect + 0.0003f * ray.direction;
+		}
+		return;
+	}
+
+	// Specular
+	if (randNum < specProb){
+		ray.direction = ray.direction - 2.0f * (glm::dot(ray.direction, normal)) * normal;
+		ray.direction = glm::normalize(ray.direction);
+		ray.color = ray.color * m.specular.color * (1.0f/specProb);
+	}
+	// Diffuse
+	else{
+		ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
+		ray.direction = glm::normalize(ray.direction);
+		ray.color = ray.color * m.color * (1.0f/(1.0f-specProb));
+	}
+	ray.origin = intersect;
+}
\ No newline at end of file
diff --git a/src/intersections.h b/src/intersections.h
index f34b89d..29d3500 100644
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -44,7 +44,7 @@ __host__ __device__ glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) {
  * @param outside            Output param for whether the ray came from outside.
  * @return                   Ray parameter `t` value. -1 if no intersection.
  */
-__host__ __device__ float boxIntersectionTest(Geom box, Ray r,
+__host__ __device__ float boxIntersectionTest(const Geom box, const Ray r,
         glm::vec3 &intersectionPoint, glm::vec3 &normal, bool &outside) {
     Ray q;
     q.origin    =                multiplyMV(box.inverseTransform, glm::vec4(r.origin   , 1.0f));
@@ -98,7 +98,7 @@ __host__ __device__ float boxIntersectionTest(Geom box, Ray r,
  * @param outside            Output param for whether the ray came from outside.
  * @return                   Ray parameter `t` value. -1 if no intersection.
  */
-__host__ __device__ float sphereIntersectionTest(Geom sphere, Ray r,
+__host__ __device__ float sphereIntersectionTest(const Geom sphere, const Ray r,
         glm::vec3 &intersectionPoint, glm::vec3 &normal, bool &outside) {
     float radius = .5;
 
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index e7ef1c6..1d09b0f 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -4,6 +4,9 @@
 #include <thrust/execution_policy.h>
 #include <thrust/random.h>
 #include <thrust/remove.h>
+#include <glm/gtc/matrix_inverse.hpp>
+
+//#include <stream_compaction/efficient.h">
 
 #include "sceneStructs.h"
 #include "scene.h"
@@ -14,9 +17,11 @@
 #include "intersections.h"
 #include "interactions.h"
 
+#define MAX_THREADS 64
 #define ERRORCHECK 1
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+//#define FILENAME "/d/Documents/cis565/hw3/test.txt"
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
 #if ERRORCHECK
@@ -67,10 +72,13 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
     }
 }
 
-static Scene *hst_scene = NULL;
-static glm::vec3 *dev_image = NULL;
-// TODO: static variables for device memory, scene/camera info, etc
-// ...
+static Scene *hst_scene;
+static glm::vec3 *dev_image;
+static Ray* dev_rays;
+static Ray* dev_out_rays;
+static Geom* dev_geoms;
+static Material* dev_materials;
+
 
 void pathtraceInit(Scene *scene) {
     hst_scene = scene;
@@ -79,50 +87,148 @@ void pathtraceInit(Scene *scene) {
 
     cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
     cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
-    // TODO: initialize the above static variables added above
+
+	const Geom* geoms = &(hst_scene->geoms)[0];
+	const Material* materials = &(hst_scene->materials)[0];
+
+	const int numObjects = hst_scene->geoms.size();
+	cudaMalloc((void**)&dev_rays, pixelcount*sizeof(Ray));
+	cudaMalloc((void**)&dev_out_rays, pixelcount*sizeof(Ray));
+	cudaMalloc((void**)&dev_geoms, numObjects*sizeof(Geom));
+	cudaMalloc((void**)&dev_materials, numObjects*sizeof(Material));
+
+	cudaMemcpy(dev_geoms, geoms, numObjects*sizeof(Geom), cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_materials, materials, numObjects*sizeof(Material), cudaMemcpyHostToDevice);
 
     checkCUDAError("pathtraceInit");
 }
 
 void pathtraceFree() {
     cudaFree(dev_image);  // no-op if dev_image is null
-    // TODO: clean up the above static variables
-
+	cudaFree(dev_rays);
+	cudaFree(dev_geoms);
+	cudaFree(dev_materials);
     checkCUDAError("pathtraceFree");
 }
 
-/**
- * Example function to generate static and test the CUDA-GL interop.
- * Delete this once you're done looking at it!
- */
-__global__ void generateNoiseDeleteMe(Camera cam, int iter, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+__global__ void initRays(int n, int iter, Camera cam, Ray* rays, float cam_pos_dx, float cam_pos_dy){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
 
-    if (x < cam.resolution.x && y < cam.resolution.y) {
-        int index = x + (y * cam.resolution.x);
+	if (index < n){
+		int x = index % cam.resolution.x;
+		int y = index / cam.resolution.x;
+		glm::vec3 left = glm::cross(cam.up, cam.view);
 
-        thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
-        thrust::uniform_real_distribution<float> u01(0, 1);
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+		thrust::uniform_real_distribution<float> u01(-0.1, 0.1);
 
-        // CHECKITOUT: Note that on every iteration, noise gets added onto
-        // the image (not replaced). As a result, the image smooths out over
-        // time, since the output image is the contents of this array divided
-        // by the number of iterations.
-        //
-        // Your renderer will do the same thing, and, over time, it will become
-        // smoother.
-        image[index] += glm::vec3(u01(rng));
-    }
+		float res2x = cam.resolution.x / 2.0f;
+		float res2y = cam.resolution.y / 2.0f;
+
+		float magx = -(res2x - x + u01(rng))*sin(cam.fov.x) / res2x;
+		float magy = (res2y - y + u01(rng))*sin(cam.fov.y) / res2y;
+
+		glm::vec3 direction = cam.view + magx*left + magy*cam.up;
+		direction = glm::normalize(direction);
+
+		// Depth-of-field computations
+		// Source : https://www.cs.princeton.edu/courses/archive/fall00/cs426/lectures/raycast/sld017.htm
+		glm::vec3 focal_point = cam.position + cam.focal * cam.view;
+		float t = -(glm::dot(cam.position, direction) + glm::dot(focal_point, cam.view)) / ((glm::dot(direction,cam.view))+0.000001);
+		glm::vec3 intersection = cam.position + t*direction;
+
+		//rays[index].origin = cam.position;
+		rays[index].origin = cam.position + cam_pos_dx*left + cam_pos_dy*cam.up;
+		direction = intersection - rays[index].origin;
+
+		rays[index].direction = direction;
+		rays[index].color = glm::vec3(1.0);
+		rays[index].isAlive = true;
+		rays[index].index = index;
+	}
 }
 
+__global__ void intersect(int iter, int depth, int traceDepth, int n, Camera cam, Ray* rays, int numObjects, const Geom* geoms, const Material* materials){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	glm::vec3 normal;
+	glm::vec3 intersectionPoint;
+	float isIntersection;
+	bool outside;
+	
+	glm::vec3 minNormal;
+	glm::vec3 minIntersectionPoint;
+
+	float minDist = INFINITY;
+	int obj_index = -1;
+
+	if (index < n){
+
+		if (!rays[index].isAlive){
+			return;
+		}
+
+		if (depth == traceDepth - 1 && rays[index].isAlive){
+			rays[index].color = glm::vec3(0.0);
+			rays[index].isAlive = false;
+			return;
+		}
+
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, depth);
+		Ray ray = rays[index];
+
+		for (int i = 0; i < numObjects; i++){
+			if (geoms[i].type == SPHERE){
+				isIntersection = sphereIntersectionTest(geoms[i], ray, intersectionPoint, normal, outside);
+			}
+			else {
+				isIntersection = boxIntersectionTest(geoms[i], ray, intersectionPoint, normal, outside);
+			}
+
+			if (isIntersection > 0 && minDist > glm::distance(ray.origin, intersectionPoint)){
+				minNormal = normal;
+				minIntersectionPoint = intersectionPoint;
+				minDist = glm::distance(ray.origin, intersectionPoint);
+				obj_index = i;
+			}
+		}
+
+		if (obj_index >= 0){
+			scatterRay(rays[index], minIntersectionPoint, minNormal, materials[geoms[obj_index].materialid], geoms[obj_index], rng);
+
+		}
+		else{
+			rays[index].color = glm::vec3(0.0);
+			rays[index].isAlive = false;
+		}
+	}
+}
+
+__global__ void updatePixels(int n, Ray* rays, glm::vec3* image){
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		if (!rays[index].isAlive){
+			image[rays[index].index] += rays[index].color;
+		}
+	}
+}
+
+struct is_dead{
+	__host__ __device__
+	bool operator()(const Ray ray){
+		return !ray.isAlive;
+	}
+};
+
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
  */
 void pathtrace(uchar4 *pbo, int frame, int iter) {
     const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
+	const Camera &cam = hst_scene->state.camera;
+	const int numObjects = hst_scene->geoms.size();
     const int pixelcount = cam.resolution.x * cam.resolution.y;
 
     const dim3 blockSize2d(8, 8);
@@ -130,36 +236,58 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
             (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
             (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
-    ///////////////////////////////////////////////////////////////////////////
+	// Initialize aperture for DOF
+	int numBlocks = (pixelcount-1) / MAX_THREADS + 1;
+
+	thrust::default_random_engine rng = makeSeededRandomEngine(iter, iter, 0);
+	thrust::uniform_real_distribution<float> uAp(-cam.aperture, cam.aperture);
+
+	float dx = uAp(rng);
+	float dy = uAp(rng);
+
+	// Initialize rays
+	initRays<<<numBlocks, MAX_THREADS>>>(pixelcount, iter, cam, dev_rays, dx, dy);
+	checkCUDAError("initRays");
+
+	// Handle movement (TODO: make this faster?)
+	Geom* geoms = &(hst_scene->geoms)[0];
+	glm::vec3 new_translation;
+	glm::vec3 new_direction;
+	float dist;
+	for (int i = 0; i < numObjects; i++){
+		if (geoms[i].moving){
+			new_direction = geoms[i].moveto - geoms[i].translation;
+			dist = glm::length(new_direction);
+			new_direction = glm::normalize(new_direction);
+			thrust::uniform_real_distribution<float> uMove(0, dist);
+			new_translation = geoms[i].translation + new_direction * uMove(rng);
 
-    // Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray is a (ray, color) pair, where color starts as the
-    //     multiplicative identity, white = (1, 1, 1).
-    //   * For debugging, you can output your ray directions as colors.
-    // * For each depth:
-    //   * Compute one new (ray, color) pair along each path (using scatterRay).
-    //     Note that many rays will terminate by hitting a light or hitting
-    //     nothing at all. You'll have to decide how to represent your path rays
-    //     and how you'll mark terminated rays.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //       surface.
-    //     * You can debug your ray-scene intersections by displaying various
-    //       values as colors, e.g., the first surface normal, the first bounced
-    //       ray direction, the first unlit material color, etc.
-    //   * Add all of the terminated rays' results into the appropriate pixels.
-    //   * Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    // * Finally, handle all of the paths that still haven't terminated.
-    //   (Easy way is to make them black or background-colored.)
-
-    // TODO: perform one iteration of path tracing
-
-    generateNoiseDeleteMe<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, dev_image);
+			geoms[i].transform = utilityCore::buildTransformationMatrix(
+				new_translation, geoms[i].rotation, geoms[i].scale);
+			geoms[i].inverseTransform = glm::inverse(geoms[i].transform);
+			geoms[i].invTranspose = glm::inverseTranspose(geoms[i].transform);
+		}
+	}
+	cudaMemcpy(dev_geoms, geoms, numObjects*sizeof(Geom), cudaMemcpyHostToDevice);
+
+	// Path tracing
+	int numAlive = pixelcount;
+	Ray* last_ray;
+
+	for (int d = 0; d < traceDepth; d++){
+		numBlocks = (numAlive - 1) / MAX_THREADS + 1;
+		intersect<<<numBlocks, MAX_THREADS>>>(iter, d, traceDepth, numAlive, cam, dev_rays, numObjects, dev_geoms, dev_materials);
+		updatePixels<<<numBlocks, MAX_THREADS>>>(numAlive, dev_rays, dev_image);
+
+		numAlive = shared_compact(numAlive, dev_out_rays, dev_rays);
+		cudaMemcpy(dev_rays, dev_out_rays, numAlive*sizeof(Ray), cudaMemcpyDeviceToDevice);
+
+		//last_ray = thrust::remove_if(thrust::device, dev_rays, dev_rays + numAlive, is_dead());
+		//numAlive = last_ray - dev_rays;
+		if (numAlive == 0){
+			break;
+		}
+	}
 
     ///////////////////////////////////////////////////////////////////////////
 
@@ -172,3 +300,179 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
 
     checkCUDAError("pathtrace");
 }
+
+/*
+* Exclusive scan on idata, stores into odata, using shared memory
+*/
+__global__ void kernSharedScan(int n, int *odata, const int *idata){
+	extern __shared__ int temp[];
+
+	int index = threadIdx.x;
+
+	int offset = 1;
+
+	temp[2 * index] = idata[2 * index + (blockIdx.x*blockDim.x*2)];
+	temp[2 * index + 1] = idata[2 * index + 1 + (blockIdx.x*blockDim.x*2)];
+
+	for (int d = n >> 1; d > 0; d >>= 1){
+		__syncthreads();
+		if (index < d){
+			int ai = offset*(2 * index + 1) - 1;
+			int bi = offset*(2 * index + 2) - 1;
+			temp[bi] += temp[ai];
+		}
+		offset *= 2;
+	}
+
+	if (index == 0){
+		temp[n - 1] = 0;
+	}
+
+	for (int d = 1; d < n; d *= 2){
+		offset >>= 1;
+		__syncthreads();
+		if (index < d){
+			int ai = offset*(2 * index + 1) - 1;
+			int bi = offset*(2 * index + 2) - 1;
+			int t = temp[ai];
+			temp[ai] = temp[bi];
+			temp[bi] += t;
+		}
+	}
+	__syncthreads();
+	odata[2 * index + (blockIdx.x*blockDim.x*2)] = temp[2 * index];
+	odata[2 * index + 1 + (blockIdx.x*blockDim.x*2)] = temp[2 * index + 1];
+}
+
+template <typename T> __global__ void kernFindAlive(int n, int* odata, T* idata){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		if (idata[index].isAlive){
+			odata[index] = 1;
+		}
+		else {
+			odata[index] = 0;
+		}
+		
+	}
+}
+
+template <typename T> __global__ void kernScatter(int n, T* odata, T* idata, int* bools, int* scan){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		if (bools[index] == 1){
+			odata[scan[index]] = idata[index];
+		}
+	}
+}
+
+__global__ void kernGetLastInBlocks(int n, int blockSize, int* dev_odata, int* dev_idata, int* dev_orig_idata){
+	// n is the number of elements expected in the output array
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		dev_odata[index] = dev_idata[(index + 1)*blockSize - 1] + dev_orig_idata[(index + 1)*blockSize - 1];
+	}
+}
+
+__global__ void kernInPlaceIncrementBlocks(int n, int* dev_data, int* increment){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		dev_data[index] = dev_data[index] + increment[blockIdx.x];
+	}
+}
+
+void blockwise_scan(int n, int* dev_odata, int* dev_idata){
+	int nfb = (n - 1) / MAX_THREADS + 1;
+	int n2 = nfb * MAX_THREADS;
+	int n_size = n * sizeof(int); // original input array size in bytes
+	int fb_size = nfb*MAX_THREADS*sizeof(int); // full block size in bytes
+	int shared_mem_size = MAX_THREADS * sizeof(int);
+
+	int* dev_idata_fb;
+	int* dev_odata_fb;
+
+	//TODO: only need this is non-power-of-2
+	cudaMalloc((void**)&dev_idata_fb, fb_size);
+	cudaMalloc((void**)&dev_odata_fb, fb_size);
+
+	cudaMemset(dev_idata_fb, 0, fb_size);
+	cudaMemcpy(dev_idata_fb, dev_idata, n_size, cudaMemcpyDeviceToDevice);
+
+	// Base case
+	if (nfb == 1){
+		kernSharedScan<<<nfb, MAX_THREADS/2, shared_mem_size>>>(MAX_THREADS, dev_odata_fb, dev_idata_fb);
+		cudaMemcpy(dev_odata, dev_odata_fb, n_size, cudaMemcpyDeviceToDevice);
+		cudaFree(dev_idata_fb);
+		cudaFree(dev_odata_fb);
+		return;
+	}
+
+	// Recurse
+	int* dev_block_increments;
+	int* dev_block_increments_scan;
+	int numBlocks = (nfb - 1) / MAX_THREADS + 1;
+	cudaMalloc((void**)&dev_block_increments, nfb*sizeof(int));
+	cudaMalloc((void**)&dev_block_increments_scan, nfb*sizeof(int));
+
+	kernSharedScan <<<nfb, MAX_THREADS/2, shared_mem_size>>>(MAX_THREADS, dev_odata_fb, dev_idata_fb);
+	//cudaDeviceSynchronize();
+
+	kernGetLastInBlocks<<<numBlocks, MAX_THREADS>>>(nfb, MAX_THREADS, dev_block_increments, dev_odata_fb, dev_idata_fb);
+	//cudaDeviceSynchronize();
+
+	blockwise_scan(nfb, dev_block_increments_scan, dev_block_increments);
+
+	kernInPlaceIncrementBlocks<<<nfb, MAX_THREADS>>>(n2, dev_odata_fb, dev_block_increments_scan);
+	//cudaDeviceSynchronize();
+
+	cudaMemcpy(dev_odata, dev_odata_fb, n_size, cudaMemcpyDeviceToDevice);
+
+	cudaFree(dev_idata_fb);
+	cudaFree(dev_odata_fb);
+	cudaFree(dev_block_increments);
+	cudaFree(dev_block_increments_scan);
+}
+
+template <typename T> int shared_compact(int n, T* dev_odata, T* dev_idata){
+	// Returns the number of elements remaining, elements after the return value in odata are undefined
+	// Assumes device memory
+
+	int numBlocks = (n - 1) / MAX_THREADS + 1;
+	int n_size = n * sizeof(int);
+	int n2_size = numBlocks*MAX_THREADS*sizeof(int);
+	int out_size = 0;
+
+	int* dev_temp;
+	int* dev_temp2;
+	int* dev_scan;
+
+	cudaMalloc((void**)&dev_temp, n_size);
+	cudaMalloc((void**)&dev_temp2, n2_size);
+	cudaMalloc((void**)&dev_scan, n2_size);
+
+	// Compute temp (binary)
+	kernFindAlive<<<numBlocks, MAX_THREADS>>>(n, dev_temp, dev_idata);
+	cudaDeviceSynchronize();
+
+	// Scan on temp
+	blockwise_scan(n, dev_scan, dev_temp);
+
+	// Scatter on scan
+	kernScatter<<<numBlocks, MAX_THREADS>>>(n, dev_odata, dev_idata, dev_temp, dev_scan);
+
+	// Compute outsize
+	int lastnum;
+	int lastbool;
+	cudaMemcpy(&lastnum, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&lastbool, dev_temp + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+	out_size = lastnum + lastbool;
+
+	cudaFree(dev_temp);
+	cudaFree(dev_temp2);
+	cudaFree(dev_scan);
+	return out_size;
+}
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 1241227..17c210a 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -6,3 +6,28 @@
 void pathtraceInit(Scene *scene);
 void pathtraceFree();
 void pathtrace(uchar4 *pbo, int frame, int iteration);
+
+inline int ilog2(int x) {
+	int lg = 0;
+	while (x >>= 1) {
+		++lg;
+	}
+	return lg;
+}
+
+inline int ilog2ceil(int x) {
+	return ilog2(x - 1) + 1;
+}
+
+// Kernels
+__global__ void kernSharedScan(int n, int *odata, const int *idata);
+template <typename T> __global__ void kernFindAlive(int n, int* odata, T* idata);
+template <typename T> __global__ void kernScatter(int n, T* odata, T* idata, int* bools, int* scan);
+template <typename T, typename Predicate> __global__ void kernMapToBool(int n, int* odata, T* idata, Predicate pred);
+__global__ void kernInPlaceIncrementBlocks(int n, int* dev_data, int* increment);
+__global__ void shared_scan(int n, int *odata, const int *idata);
+
+
+// Normal functions
+void blockwise_scan(int n, int* dev_odata, int* dev_idata);
+template <typename T> int shared_compact(int n, T* dev_odata, T* dev_idata);
\ No newline at end of file
diff --git a/src/scene.cpp b/src/scene.cpp
index 5804ce3..dd4e5c1 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -63,6 +63,7 @@ int Scene::loadGeom(string objectid) {
         }
 
         //load transformations
+		newGeom.moving = false;
         utilityCore::safeGetline(fp_in, line);
         while (!line.empty() && fp_in.good()) {
             vector<string> tokens = utilityCore::tokenizeString(line);
@@ -74,7 +75,10 @@ int Scene::loadGeom(string objectid) {
                 newGeom.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
             } else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
                 newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
-            }
+			} else if (strcmp(tokens[0].c_str(), "MOVETO") == 0) {
+				newGeom.moving = true;
+				newGeom.moveto = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+			}
 
             utilityCore::safeGetline(fp_in, line);
         }
@@ -96,14 +100,14 @@ int Scene::loadCamera() {
     float fovy;
 
     //load static properties
-    for (int i = 0; i < 5; i++) {
+    for (int i = 0; i < 7; i++) {
         string line;
         utilityCore::safeGetline(fp_in, line);
         vector<string> tokens = utilityCore::tokenizeString(line);
         if (strcmp(tokens[0].c_str(), "RES") == 0) {
             camera.resolution.x = atoi(tokens[1].c_str());
             camera.resolution.y = atoi(tokens[2].c_str());
-        } else if (strcmp(tokens[0].c_str(), "FOVY") == 0) {
+		} else if (strcmp(tokens[0].c_str(), "FOVY") == 0) {
             fovy = atof(tokens[1].c_str());
         } else if (strcmp(tokens[0].c_str(), "ITERATIONS") == 0) {
             state.iterations = atoi(tokens[1].c_str());
@@ -111,7 +115,11 @@ int Scene::loadCamera() {
             state.traceDepth = atoi(tokens[1].c_str());
         } else if (strcmp(tokens[0].c_str(), "FILE") == 0) {
             state.imageName = tokens[1];
-        }
+		} else if (strcmp(tokens[0].c_str(), "FOCAL") == 0){
+			camera.focal = atof(tokens[1].c_str());
+		} else if (strcmp(tokens[0].c_str(), "APERTURE") == 0){
+			camera.aperture = atof(tokens[1].c_str());
+		}
     }
 
     string line;
@@ -161,7 +169,7 @@ int Scene::loadMaterial(string materialid) {
             if (strcmp(tokens[0].c_str(), "RGB") == 0) {
                 glm::vec3 color( atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()) );
                 newMaterial.color = color;
-            } else if (strcmp(tokens[0].c_str(), "SPECEX") == 0) {
+			} else if (strcmp(tokens[0].c_str(), "SPECEX") == 0) {
                 newMaterial.specular.exponent = atof(tokens[1].c_str());
             } else if (strcmp(tokens[0].c_str(), "SPECRGB") == 0) {
                 glm::vec3 specColor(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index baa2e30..351ece7 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -13,12 +13,17 @@ enum GeomType {
 struct Ray {
     glm::vec3 origin;
     glm::vec3 direction;
+	bool isAlive;
+	glm::vec3 color;
+	int index;
 };
 
 struct Geom {
     enum GeomType type;
     int materialid;
     glm::vec3 translation;
+	bool moving;
+	glm::vec3 moveto;
     glm::vec3 rotation;
     glm::vec3 scale;
     glm::mat4 transform;
@@ -44,6 +49,8 @@ struct Camera {
     glm::vec3 view;
     glm::vec3 up;
     glm::vec2 fov;
+	float focal;
+	float aperture;
 };
 
 struct RenderState {
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index ac358c9..bcc484e 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,4 +1,16 @@
 set(SOURCE_FILES
+    "common.h"
+    "common.cu"
+    "cpu.h"
+    "cpu.cu"
+    "naive.h"
+    "naive.cu"
+    "efficient.h"
+    "efficient.cu"
+    "thrust.h"
+    "thrust.cu"
+    "radix.h"
+    "radix.cu"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..e5d4dfc
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,49 @@
+#include "common.h"
+
+void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess == err) {
+        return;
+    }
+
+    fprintf(stderr, "CUDA error");
+    if (file) {
+        fprintf(stderr, " (%s:%d)", file, line);
+    }
+    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+}
+
+
+namespace StreamCompaction {
+namespace Common {
+
+/**
+ * Maps an array to an array of 0s and 1s for stream compaction. Elements
+ * which map to 0 will be removed, and elements which map to 1 will be kept.
+ */
+__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		bools[k] = !!idata[k];
+	}
+}
+
+/**
+ * Performs scatter on an array. That is, for each element in idata,
+ * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+ */
+__global__ void kernScatter(int n, int *odata,
+        const int *idata, const int *bools, const int *indices) {
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		if (bools[k] == 1){
+			odata[indices[k]] = idata[k];
+		}
+	}
+}
+
+}
+}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..ae1c41b
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+#define MAXTHREADS 512
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return ilog2(x - 1) + 1;
+}
+
+
+namespace StreamCompaction {
+namespace Common {
+    __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+    __global__ void kernScatter(int n, int *odata,
+            const int *idata, const int *bools, const int *indices);
+}
+}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
new file mode 100644
index 0000000..cedfda7
--- /dev/null
+++ b/stream_compaction/cpu.cu
@@ -0,0 +1,108 @@
+#include <cstdio>
+#include "cpu.h"
+#include <ctime>
+#include <cuda.h>
+
+namespace StreamCompaction {
+namespace CPU {
+
+/**
+ * CPU scan (prefix sum).
+ */
+void scan(int n, int *odata, const int *idata) {
+
+	cudaEvent_t start,stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	cudaEventRecord(start);
+
+	odata[0] = 0;
+	for (int i=1; i<n; i++){
+		odata[i] = odata[i-1] + idata[i-1];
+	}
+
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("cpu scan (s) cuda: %f\n", ms/1000.0);
+}
+
+/**
+ * CPU stream compaction without using the scan function.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithoutScan(int n, int *odata, const int *idata) {
+	cudaEvent_t start,stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	cudaEventRecord(start);
+
+	int c = 0;
+	for (int i=0; i<n; i++){
+		if(idata[i] != 0){
+			odata[c] = idata[i];
+			c++;
+		}
+	}
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("cpu compact w/o scan (s) cuda: %f\n", ms/1000.0);
+
+    return c;
+}
+
+/**
+ * CPU stream compaction using scan and scatter, like the parallel version.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithScan(int n, int *odata, const int *idata) {
+
+	cudaEvent_t start,stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	cudaEventRecord(start);
+
+	// Temp
+	int* temp = new int[n];
+	for (int i=0; i<n; i++){
+		if(idata[i] != 0){
+			temp[i] = 1;
+		} else {
+			temp[i] = 0;
+		}
+	}
+
+	// Scan
+	int* scan_arr = new int[n];
+	scan(n, scan_arr, temp);
+
+	// Number of elements in the final array
+	int c = scan_arr[n-1] + temp[n-1];
+
+	// Scatter
+	for(int i=0; i<n; i++){
+		if (temp[i] == 1){
+			int oind = scan_arr[i];
+			odata[oind] = idata[i];
+		}
+	}
+
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("cpu compact w/ scan (s) cuda: %f\n", ms/1000.0);
+
+    return c;
+}
+
+}
+}
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
new file mode 100644
index 0000000..af4e7bd
--- /dev/null
+++ b/stream_compaction/cpu.h
@@ -0,0 +1,22 @@
+#pragma once
+inline int ilog2(int x) {
+	int lg = 0;
+	while (x >>= 1) {
+		++lg;
+	}
+	return lg;
+}
+
+inline int ilog2ceil(int x) {
+	return ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+namespace CPU {
+    void scan(int n, int *odata, const int *idata);
+
+    int compactWithoutScan(int n, int *odata, const int *idata);
+
+    int compactWithScan(int n, int *odata, const int *idata);
+}
+}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
new file mode 100644
index 0000000..0378010
--- /dev/null
+++ b/stream_compaction/efficient.cu
@@ -0,0 +1,250 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+
+#define MAX_THREADS 512
+
+namespace StreamCompaction {
+namespace Efficient {
+
+// TODO: __global__
+
+__global__ void upsweep(int n, int d, int* idata){
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		int d1 = d + 1;
+		int powd = 1 << d;
+		int powd1 = 1 << d1;
+		if (k % (powd1) == 0){
+			idata[k + powd1 - 1] += idata[k + powd - 1];
+		}
+	}
+}
+
+__global__ void downsweep(int n, int d, int* idata){
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		int d1 = d + 1;
+		int powd = 1 << d;
+		int powd1 = 1 << d1;
+		if (k % (powd1) == 0){
+			int t = idata[k + powd - 1];
+			idata[k + powd - 1] = idata[k + powd1 - 1];
+			idata[k + powd1 - 1] += t;
+		}
+	}
+}
+
+/*
+* Exclusive scan on idata, stores into odata, using shared memory
+*/
+__global__ void shared_scan(int n, int *odata, const int *idata){
+	__shared__ int* temp;
+
+	int index = threadIdx.x;
+	int offset = 1;
+
+	temp[2 * index] = idata[2 * index];
+	temp[2 * index + 1] = idata[2 * index + 1];
+
+	for (int d = n >> 1; d > 0; d >>= 1){
+		__syncthreads();
+		if (index < d){
+			int ai = offset*(2 * index + 1) - 1;
+			int bi = offset*(2 * index + 2) - 1;
+			temp[bi] += temp[ai];
+		}
+	}
+	offset *= 2;
+	if (index == 0){
+		temp[n - 1] = 0;
+	}
+
+	for (int d = 1; d < n; d *= 2){
+		offset >>= 1;
+		__syncthreads();
+		if (index < d){
+			int ai = offset*(2 * index + 1) - 1;
+			int bi = offset*(2 * index + 2) - 1;
+			int t = temp[ai];
+			temp[ai] = temp[bi];
+			temp[bi] += t;
+		}
+	}
+	__syncthreads();
+	odata[2 * index] = temp[2 * index];
+	odata[2 * index + 1] = temp[2 * index + 1];
+}
+
+template <typename T, typename Predicate> __global__ void kernMapToBoolean(int n, int* odata, T* idata, Predicate pred){
+	int index = (blockIdx.x*blockDim.x)+threadIdx.x;
+
+	if (index < n){
+		odata[index] = pred(idata[index]);
+	}
+}
+
+template <typename T> __global__ void kernScatter(int n, T* odata, T* idata, int* bools, int* scan){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		if (bools[index] == 1){
+			odata[scan[index]] = idata[index];
+		}
+	}
+}
+
+template <typename T, typename Predicate> int shared_compact(int n, T* dev_odata, T* dev_idata, Predicate pred){
+	// Returns the number of elements remaining, elements after the return value in odata are undefined
+	// Assumes device memory
+	int td = ilog2ceil(n);
+	int n2 = (int)pow(2, td);
+
+	int numBlocks = (n - 1) / MAX_THREADS + 1;
+	int numBlocks2 = (n2 - 1) / MAX_THREADS + 1;
+	int n_size = n * sizeof(int);
+	int n2_size = n2 * sizeof(int);
+	int out_size = 0;
+
+	int* dev_temp;
+	int* dev_temp_n2;
+	int* dev_scan;
+
+	cudaMalloc((void**)&dev_temp, n_size);
+	cudaMalloc((void**)&dev_temp_n2, n2_size);
+	cudaMalloc((void**)&dev_scan, n2_size);
+
+	// Compute temp (binary)
+	kernMapToBoolean<<<numBlocks, MAX_THREADS>>>(n, dev_temp, dev_idata, pred);
+
+	// Scan on temp
+	cudaMemcpy(dev_temp_n2, dev_temp, n_size, cudaMemcpyDeviceToDevice); // Grow temp
+	cudaMemset(dev_temp_n2 + n, 0, n2_size - n_size); // Pad with 0's
+	shared_scan<<<numBlocks2, MAX_THREADS>>>(n2, dev_scan, dev_temp_n2);
+	
+	// Scatter on scan
+	kernScatter<<<numBlocks, MAX_THREADS>>>(n, dev_odata, dev_idata, dev_temp, dev_scan);
+
+	// Compute outsize
+	int lastnum;
+	int lastbool;
+	cudaMemcpy(&lastnum, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&lastbool, dev_temp + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+	out_size = lastnum + lastbool;
+	return out_size;
+}
+
+/**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+void scan(int n, int *odata, const int *idata) {
+    // Compute log-rounded n
+	int td = ilog2ceil(n);
+	int n2 = (int)pow(2,td);
+
+	int numBlocks = (n2 - 1) / MAXTHREADS + 1;
+
+	int n_size = n * sizeof(int);
+	int n2_size = n2 * sizeof(int);
+
+	int* hst_idata2 = new int[n2]();
+	memcpy(hst_idata2, idata, n_size);
+
+	int* dev_idata;
+	cudaMalloc((void**)&dev_idata, n2_size);
+	cudaMemcpy(dev_idata, hst_idata2, n2_size, cudaMemcpyHostToDevice);
+
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start);
+
+	// Scan
+	int powd, powd1;
+	for(int d=0; d<td; d++){
+		upsweep<<<numBlocks,MAXTHREADS>>>(n2, d, dev_idata);
+	}
+
+	cudaMemset((void*)&dev_idata[n2-1],0,sizeof(int));
+	for(int d=td-1; d>=0; d--){
+		downsweep<<<numBlocks,MAXTHREADS>>>(n2, d, dev_idata);
+	}
+
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms = 0;
+	cudaEventElapsedTime(&ms, start, stop);
+	//printf("efficient scan time (s): %f\n",ms/1000.0);
+
+	// Remove leftover (from the log-rounded portion)
+	// No need to shift in this one I guess?
+	cudaMemcpy(odata, dev_idata, n_size, cudaMemcpyDeviceToHost);
+}
+
+/**
+ * Performs stream compaction on idata, storing the result into odata.
+ * All zeroes are discarded.
+ *
+ * @param n      The number of elements in idata.
+ * @param odata  The array into which to store elements.
+ * @param idata  The array of elements to compact.
+ * @returns      The number of elements remaining after compaction.
+ */
+int compact(int n, int *odata, const int *idata) {
+	int n_size = n*sizeof(int);
+	int numBlocks = (n - 1) / MAXTHREADS + 1;
+	int on = -1;
+
+	// Initialize memory
+	int* hst_nz = (int*)malloc(n_size);
+	
+	int* dev_nz;
+	int* dev_idata;
+	int* dev_scan;
+	int* dev_odata;
+	
+	cudaMalloc((void**)&dev_nz, n_size);
+	cudaMalloc((void**)&dev_idata, n_size);
+	cudaMalloc((void**)&dev_scan, n_size);
+	cudaMalloc((void**)&dev_odata, n_size);
+
+	// Nonzero
+	cudaMemcpy(dev_idata, idata, n_size, cudaMemcpyHostToDevice);
+
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start);
+
+	StreamCompaction::Common::kernMapToBoolean<<<numBlocks,MAXTHREADS>>>(n, dev_nz, dev_idata);
+	cudaDeviceSynchronize();
+
+	// TODO: technically only need the last element here
+	cudaMemcpy(hst_nz, dev_nz, n_size, cudaMemcpyDeviceToHost);
+
+	// Scan
+	int* hst_scan = (int*)malloc(n_size);
+	scan(n, hst_scan, hst_nz);
+	on = hst_scan[n-1] + hst_nz[n-1];
+
+	// Scatter
+	cudaMemcpy(dev_scan, hst_scan, n_size, cudaMemcpyHostToDevice);
+	StreamCompaction::Common::kernScatter<<<numBlocks,MAXTHREADS>>>(n, dev_odata, dev_idata, dev_nz, dev_scan);
+	cudaDeviceSynchronize();
+
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms = 0;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("efficient compact time (s): %f\n",ms/1000.0);
+
+	cudaMemcpy(odata, dev_odata, n_size, cudaMemcpyDeviceToHost);
+
+	return on;
+}
+
+}
+}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
new file mode 100644
index 0000000..bea70b1
--- /dev/null
+++ b/stream_compaction/efficient.h
@@ -0,0 +1,9 @@
+#pragma once
+
+namespace StreamCompaction {
+namespace Efficient {
+    void scan(int n, int *odata, const int *idata);
+    int compact(int n, int *odata, const int *idata);
+	template <typename T, typename Predicate> int shared_compact(int n, T* odata, T* idata, Predicate pred);
+}
+}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
new file mode 100644
index 0000000..eaab4fe
--- /dev/null
+++ b/stream_compaction/naive.cu
@@ -0,0 +1,68 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+
+namespace StreamCompaction {
+namespace Naive {
+
+// TODO: __global__
+__global__ void kernScan(int n, int powd, int* odata, int* idata){
+	int i = threadIdx.x + (blockIdx.x * blockDim.x);
+	//int powd = (int)pow(2,d);
+
+	if (i < n){
+		if (i >= powd){
+			odata[i] = idata[i - powd] + idata[i];
+		} else {
+			odata[i] = idata[i];
+		}
+	}
+}
+
+/**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+void scan(int n, int *odata, const int *idata) {
+	// Initialize
+	int td = ilog2ceil(n);
+	int n2 = (int)pow(2,td);
+	int numBlocks = (n2-1) / MAXTHREADS + 1;
+
+	int n_size = n * sizeof(int);
+	int n2_size = n2 * sizeof(int);
+
+	int* dev_idata;
+	int* dev_odata;
+
+	cudaMalloc((void**)&dev_idata, n2_size);
+	cudaMalloc((void**)&dev_odata, n2_size);
+	cudaMemcpy(dev_idata, idata, n_size, cudaMemcpyHostToDevice);
+	cudaMemset(dev_idata+n, 0, n2_size-n_size);
+
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	
+	// Scan
+	cudaEventRecord(start);
+	for(int d=1; d<=td; d++){
+		int powd = 1 << (d-1);
+		kernScan<<<numBlocks,MAXTHREADS>>>(n2, powd, dev_odata, dev_idata);
+		cudaThreadSynchronize();
+		cudaMemcpy(dev_idata, dev_odata, n2_size, cudaMemcpyDeviceToDevice);
+	}
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms = 0;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("naive time(s): %f\n", ms/1000.0);
+
+	// Remove leftover (from the log-rounded portion)
+	// Do a shift right to make it an exclusive sum
+	odata[0] = 0;
+	cudaMemcpy(odata+1, dev_odata, n_size-sizeof(int), cudaMemcpyDeviceToHost);
+}
+
+}
+}
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
new file mode 100644
index 0000000..21152d6
--- /dev/null
+++ b/stream_compaction/naive.h
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace StreamCompaction {
+namespace Naive {
+    void scan(int n, int *odata, const int *idata);
+}
+}
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..b1c8721
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,147 @@
+#include "radix.h"
+#include "common.h"
+#include "efficient.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+namespace StreamCompaction {
+namespace Radix {
+
+/**
+ * Maps an array to an array of 0s and 1s for stream compaction. Elements
+ * which map to 0 will be removed, and elements which map to 1 will be kept.
+ */
+__global__ void kernMapToBits(int n, int bit, int *bools, const int *idata) {
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		int b = 1 << bit;
+		bools[k] = (b & idata[k]) >> bit;
+	}
+}
+
+__global__ void kernNegate(int n, int* odata, const int* idata){
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		odata[k] = !idata[k];
+	}
+}
+
+__global__ void kernSumFalses(int n, int totalFalses, int* odata, const int* idata){
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		odata[k] = k - idata[k] + totalFalses;
+	}
+}
+
+__global__ void kernMux(int n, int* odata, const int* bdata, const int* tdata, const int* fdata){
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		odata[k] = bdata[k] ? tdata[k] : fdata[k];
+	}
+}
+
+__global__ void kernScatter(int n, int *odata,
+	const int *idata, const int *indices) {
+	int k = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (k < n){
+		odata[indices[k]] = idata[k];
+	}
+}
+
+void split(int n, int bit, int* odata, int* idata){
+	int numBlocks = (n - 1) / MAXTHREADS + 1;
+	int n_size = n * sizeof(int);
+
+	int* hst_e;
+	int* hst_f;
+	int* dev_idata;
+	int* dev_odata;
+	int* dev_b;
+	int* dev_e;
+	int* dev_f;
+	int* dev_t;
+	int* dev_d;
+	int totalFalses;
+
+	hst_e = (int*)malloc(n_size);
+	hst_f = (int*)malloc(n_size);
+	cudaMalloc((void**)&dev_idata, n_size);
+	cudaMalloc((void**)&dev_odata, n_size);
+	cudaMalloc((void**)&dev_b, n_size);
+	cudaMalloc((void**)&dev_e, n_size);
+	cudaMalloc((void**)&dev_f, n_size);
+	cudaMalloc((void**)&dev_t, n_size);
+	cudaMalloc((void**)&dev_d, n_size);
+
+	cudaMemcpy(dev_idata, idata, n_size, cudaMemcpyHostToDevice);
+
+	// b - get bits
+	kernMapToBits<<<numBlocks, MAXTHREADS>>>(n, bit, dev_b, dev_idata);
+	cudaDeviceSynchronize();
+
+	int* hst_b = (int*)malloc(n_size);
+
+	// e - negate bits
+	kernNegate<<<numBlocks, MAXTHREADS>>>(n, dev_e, dev_b);
+	cudaDeviceSynchronize();
+
+	// f - exclusive scan on e
+	cudaMemcpy(hst_e, dev_e, n_size, cudaMemcpyDeviceToHost);
+	StreamCompaction::Efficient::scan(n, hst_f, hst_e);
+	cudaMemcpy(dev_f, hst_f, n_size, cudaMemcpyHostToDevice);
+	cudaDeviceSynchronize();
+
+	// t - t[i] = i - f[i] + totalFalses
+	int en1;
+	cudaMemcpy(&en1, dev_e + (n - 1), sizeof(int), cudaMemcpyDeviceToHost);
+	totalFalses = en1 + hst_f[n - 1];
+	kernSumFalses<<<numBlocks, MAXTHREADS>>>(n, totalFalses, dev_t, dev_f);
+	cudaDeviceSynchronize();
+
+	// d - d[i] = b[i] ? t[i] : f[i]
+	kernMux<<<numBlocks, MAXTHREADS>>>(n, dev_d, dev_b, dev_t, dev_f);
+	cudaDeviceSynchronize();
+
+	// scatter idata according to d
+	kernScatter<<<numBlocks, MAXTHREADS>>>(n, dev_odata, dev_idata, dev_d);
+	cudaDeviceSynchronize();
+
+	cudaMemcpy(odata, dev_odata, n_size, cudaMemcpyDeviceToHost);
+}
+
+void sort(int n, int* odata, const int* idata){
+	int numBits = 8 * sizeof(int); //TODO: probably only need log size of largest int in the array
+	int n_size = n*sizeof(int);
+	
+	int* hst_idata = (int*)malloc(n_size);
+	int* hst_odata = (int*)malloc(n_size);
+	int* tmp;
+	memcpy(hst_idata, idata, n_size);
+
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start);
+
+	for (int i = 0; i < numBits; i++){
+		split(n, i, hst_odata, hst_idata);
+		cudaDeviceSynchronize();
+		memcpy(hst_idata, hst_odata, n_size);
+	}
+
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms = 0;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("radix sort time (s): %f\n", ms / 1000.0);
+
+	memcpy(odata, hst_odata, n_size);
+}
+
+}
+}
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..bea7a3f
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+
+#define MAXTHREADS 1024
+
+namespace StreamCompaction {
+namespace Radix {
+	void split(int n, int bit, int* odata, int* idata);
+	void sort(int n, int* odata, const int* idata);
+}
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
new file mode 100644
index 0000000..a15b6ad
--- /dev/null
+++ b/stream_compaction/thrust.cu
@@ -0,0 +1,45 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+#include "common.h"
+#include "thrust.h"
+
+namespace StreamCompaction {
+namespace Thrust {
+
+/**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+void scan(int n, int *odata, const int *idata) {
+    // TODO use `thrust::exclusive_scan`
+    // example: for device_vectors dv_in and dv_out:
+    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+	
+	// Initialize
+	thrust::host_vector<int> hst_v_in(idata,idata+n);
+	thrust::device_vector<int> v_in = hst_v_in;
+	thrust::device_vector<int> v_out(n);
+
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	cudaEventRecord(start);
+
+	// Scan
+	thrust::exclusive_scan(v_in.begin(), v_in.end(), v_in.begin());
+
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float ms = 0;
+	cudaEventElapsedTime(&ms, start, stop);
+	printf("thrust time (s): %f\n",ms/1000.0);
+
+	thrust::host_vector<int> hst_v_out = v_in;
+	memcpy(odata,hst_v_out.data(),n*sizeof(int));
+}
+
+}
+}
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
new file mode 100644
index 0000000..06707f3
--- /dev/null
+++ b/stream_compaction/thrust.h
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace StreamCompaction {
+namespace Thrust {
+    void scan(int n, int *odata, const int *idata);
+}
+}