diff --git a/PROJ1_WIN/565Pathtracer.sln b/PROJ1_WIN/565Pathtracer.sln
index 5cc5973..700ca36 100755
--- a/PROJ1_WIN/565Pathtracer.sln
+++ b/PROJ1_WIN/565Pathtracer.sln
@@ -1,6 +1,6 @@
 ﻿
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "565Pathtracer", "565Pathtracer\565Pathtracer.vcxproj", "{FF21CA49-522E-4E86-B508-EE515B248FC4}"
 EndProject
 Global
@@ -9,10 +9,14 @@ Global
 		Debug (v4.0)|x64 = Debug (v4.0)|x64
 		Debug (v5.5)|Win32 = Debug (v5.5)|Win32
 		Debug (v5.5)|x64 = Debug (v5.5)|x64
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
 		Release (v4.0)|Win32 = Release (v4.0)|Win32
 		Release (v4.0)|x64 = Release (v4.0)|x64
 		Release (v5.5)|Win32 = Release (v5.5)|Win32
 		Release (v5.5)|x64 = Release (v5.5)|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug (v4.0)|Win32.ActiveCfg = Debug|Win32
@@ -21,12 +25,18 @@ Global
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug (v5.5)|Win32.ActiveCfg = Debug (v5.5)|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug (v5.5)|Win32.Build.0 = Debug (v5.5)|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug (v5.5)|x64.ActiveCfg = Debug (v5.5)|Win32
+		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug|Win32.ActiveCfg = Debug|Win32
+		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug|Win32.Build.0 = Debug|Win32
+		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Debug|x64.ActiveCfg = Debug|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release (v4.0)|Win32.ActiveCfg = Release|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release (v4.0)|Win32.Build.0 = Release|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release (v4.0)|x64.ActiveCfg = Release|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release (v5.5)|Win32.ActiveCfg = Release (v5.5)|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release (v5.5)|Win32.Build.0 = Release (v5.5)|Win32
 		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release (v5.5)|x64.ActiveCfg = Release (v5.5)|Win32
+		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release|Win32.ActiveCfg = Release|Win32
+		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release|Win32.Build.0 = Release|Win32
+		{FF21CA49-522E-4E86-B508-EE515B248FC4}.Release|x64.ActiveCfg = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj
index 4515c57..20bcc20 100755
--- a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj
+++ b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj
@@ -19,6 +19,7 @@
     </ProjectConfiguration>
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\src\cudaAlgorithms.h" />
     <ClInclude Include="..\..\src\cudaMat4.h" />
     <ClInclude Include="..\..\src\glslUtility.h" />
     <ClInclude Include="..\..\src\image.h" />
@@ -42,6 +43,7 @@
     <ClCompile Include="..\..\src\utilities.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <CudaCompile Include="..\..\src\cudaAlgorithms.cu" />
     <CudaCompile Include="..\..\src\raytraceKernel.cu" />
   </ItemGroup>
   <PropertyGroup Label="Globals">
@@ -55,27 +57,31 @@
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>Unicode</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug (v5.5)|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>Unicode</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (v5.5)|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="..\Build\CUDA 4.0.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 5.5.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -132,20 +138,23 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include;C:\ProgramData\NVIDIA Corporation\CUDA Samples\v5.5\common\inc;../shared/glew/include;../shared/freeglut/include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include;C:\ProgramData\NVIDIA Corporation\CUDA Samples\v5.5\common\inc;../shared/glew/include;../shared/freeglut/include;C:\Program Files\NVIDIA Corporation\NvToolsExt\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalLibraryDirectories>../shared/glew/lib;../shared/freeglut/lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>cudart.lib; glew32.lib;glu32.lib;opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64;../shared/glew/lib;../shared/freeglut/lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>cudart.lib;glew32.lib;glu32.lib;opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;nvToolsExt64_1.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <IgnoreSpecificDefaultLibraries>
       </IgnoreSpecificDefaultLibraries>
     </Link>
     <CudaCompile>
       <CompileOut>$(ProjectDir)$(Platform)/$(Configuration)/%(Filename)%(Extension).obj</CompileOut>
-      <Include>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include;C:/ProgramData/NVIDIA Corporation/CUDA Samples/v5.5/common/inc;../shared/glew/includes;../shared/freeglut/includes</Include>
+      <Include>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include;C:/ProgramData/NVIDIA Corporation/CUDA Samples/v5.5/common/inc;../shared/glew/includes;../shared/freeglut/includes;C:\Program Files\NVIDIA Corporation\NvToolsExt\include</Include>
+      <GPUDebugInfo>true</GPUDebugInfo>
+      <GenerateLineInfo>true</GenerateLineInfo>
+      <CodeGeneration>compute_20,sm_20</CodeGeneration>
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -196,10 +205,11 @@
     <CudaCompile>
       <CompileOut>$(ProjectDir)$(Platform)/$(Configuration)/%(Filename)%(Extension).obj</CompileOut>
       <Include>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include;C:/ProgramData/NVIDIA Corporation/CUDA Samples/v5.5/common/inc;../shared/glew/includes;../shared/freeglut/includes</Include>
+      <CodeGeneration>compute_20,sm_20</CodeGeneration>
     </CudaCompile>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="..\Build\CUDA 4.0.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 5.5.targets" />
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters
index d49ad9c..8a385f7 100755
--- a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters
+++ b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.filters
@@ -30,9 +30,11 @@
     <ClInclude Include="..\..\src\stb_image\stb_image.h">
       <Filter>stb_image</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\cudaAlgorithms.h" />
   </ItemGroup>
   <ItemGroup>
     <CudaCompile Include="..\..\src\raytraceKernel.cu" />
+    <CudaCompile Include="..\..\src\cudaAlgorithms.cu" />
   </ItemGroup>
   <ItemGroup>
     <Filter Include="stb_image">
diff --git a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user
index dfd9f6c..57ad496 100755
--- a/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user
+++ b/PROJ1_WIN/565Pathtracer/565Pathtracer.vcxproj.user
@@ -5,7 +5,7 @@
     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug (v5.5)|Win32'">
-    <LocalDebuggerCommandArguments>scene=../../scenes/sampleScene.txt</LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>scene=../../scenes/hallofmirrors.txt</LocalDebuggerCommandArguments>
     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug (v4.0)|Win32'">
@@ -13,7 +13,7 @@
     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release (v5.5)|Win32'">
-    <LocalDebuggerCommandArguments>scene=../../scenes/sampleScene.txt</LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>scene=../../scenes/hallofmirrors.txt</LocalDebuggerCommandArguments>
     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
   </PropertyGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/README.md b/README.md
index 1e36dc5..18c4225 100755
--- a/README.md
+++ b/README.md
@@ -1,24 +1,34 @@
 -------------------------------------------------------------------------------
-CIS565: Project 2: CUDA Pathtracer
--------------------------------------------------------------------------------
-Fall 2013
--------------------------------------------------------------------------------
-Due Wednesday, 10/02/13
--------------------------------------------------------------------------------
-
+CIS565 Fall 2013: Project 2: CUDA Pathtracer
 -------------------------------------------------------------------------------
 NOTE:
 -------------------------------------------------------------------------------
-This project requires an NVIDIA graphics card with CUDA capability! Any card after the Geforce 8xxx series will work. If you do not have an NVIDIA graphics card in the machine you are working on, feel free to use any machine in the SIG Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped with CUDA capable NVIDIA graphics cards. If this too proves to be a problem, please contact Patrick or Liam as soon as possible.
+This project requires an NVIDIA graphics card with CUDA compute 2.0 capability! 
+
 
 -------------------------------------------------------------------------------
 INTRODUCTION:
 -------------------------------------------------------------------------------
-In this project, you will extend your raytracer from Project 1 into a full CUDA based global illumination pathtracer. 
+This is a basic path tracing engine written in CUDA. 
 
-For this project, you may either choose to continue working off of your codebase from Project 1, or you may choose to use the included basecode in this repository. The basecode for Project 2 is the same as the basecode for Project 1, but with some missing components you will need filled in, such as the intersection testing and camera raycasting methods. 
+It is capable of simulating a diverse set of materials including matte diffuse surfaces, 
+metallics, glossy smooth mirrors, translucent and transparent refractive materials, 
+glass and speckled tinted glass.
 
-How you choose to extend your raytracer into a pathtracer is a fairly open-ended problem; the supplied basecode is meant to serve as one possible set of guidelines for doing so, but you may choose any approach you want in your actual implementation, including completely scrapping the provided basecode in favor of your own from-scratch solution.
+![This scene contains elements of every feature implemented](/renders/fuzzyglass_fresnel.bmp "Speckled Tinted Glass")
+
+-------------------------------------------------------------------------------
+Features:
+-------------------------------------------------------------------------------
+
+* Full global illumination (including soft shadows, color bleeding, etc.) by pathtracing rays through the scene. 
+* Supersampled antialiasing
+* Parallelization by ray instead of by pixel
+* Perfect specular reflection
+* Scattered specular reflection and transmission (fuzzy glass/mirrors)
+* Fresnel-based reflection/refraction (i.e. glass)
+* Stream compaction for optimizing high bounce counts in open scenes
+* Optional Global Lighting sources for faster convergence
 
 -------------------------------------------------------------------------------
 CONTENTS:
@@ -40,108 +50,163 @@ The Project2 root directory contains the following subdirectories:
 The projects build and run exactly the same way as in Project0 and Project1.
 
 -------------------------------------------------------------------------------
-REQUIREMENTS:
+Interactive Controls
 -------------------------------------------------------------------------------
-In this project, you are given code for:
+The engine was designed so that many features could be modified at runtime to allow easy exploration of the effects of various parameters. In addition, several debug modes were implemented that graphically display additional information about the scene. These options do result in more complex kernels that have a negative impact on performance. I preferred the flexibility to quickly experiment for this project, but in the path tracer I will be redesigning the kernel structure from the ground up with performance in mind.
 
-* All of the basecode from Project 1, plus:
-* Intersection testing code for spheres and cubes
-* Code for raycasting from the camera
+Here is a complete list of the keypress commands you can use at runtime.
 
-You will need to implement the following features. A number of these required features you may have already implemented in Project 1. If you have, you are ahead of the curve and have less work to do! 
+Keypress | Function
+--- | ---
+A | Toggles Anti-Aliasing
+S | Toggles Stream Compaction
+F | Toggles Frame Filtering
+G | Toggles Harsh Global Shadows
+f | Clears frame filter
+= | Increase trace depth
+- | Decrease trace depth
+ESC | Exit
+1 | Pathtracing Render Mode
+2 | Ray Coverage Debug Mode
+3 | Trace Depth Debug Mode
+4 | First Hit Debug Mode
+5 | Normals Debug Mode
 
-* Full global illumination (including soft shadows, color bleeding, etc.) by pathtracing rays through the scene. 
-* Properly accumulating emittance and colors to generate a final image
-* Supersampled antialiasing
-* Parallelization by ray instead of by pixel via stream compaction
-* Perfect specular reflection
-
-You are also required to implement at least two of the following features. Some of these features you may have already implemented in Project 1. If you have, you may NOT resubmit those features and instead must pick two new ones to implement.
-
-* Additional BRDF models, such as Cook-Torrance, Ward, etc. Each BRDF model may count as a separate feature. 
-* Texture mapping 
-* Bump mapping
-* Translational motion blur
-* Fresnel-based Refraction, i.e. glass
-* OBJ Mesh loading and rendering without KD-Tree
-* Interactive camera
-* Integrate an existing stackless KD-Tree library, such as CUKD (https://github.com/unvirtual/cukd)
-* Depth of field
+-------------------------------------------------------------------------------
+Debug Modes
+-------------------------------------------------------------------------------
+Ray Coverage Debug Mode should be all white if every ray is mapped correctly to a pixel.
 
-Alternatively, implementing just one of the following features can satisfy the "pick two" feature requirement, since these are correspondingly more difficult problems:
+Trace Depth Debug Mode shows how many bounces contributed to each pixel. White is the maximum number, black is 1 bounce.
+![Debug Mode](/screenshots/tracedepth_debug.bmp "Trace Debug Mode")
 
-* Physically based subsurface scattering and transmission
-* Implement and integrate your own stackless KD-Tree from scratch. 
-* Displacement mapping
-* Deformational motion blur
+First Hit Debug Mode just shows the raw diffuse color of the first object hit for collision verification and previsualization.
 
-As yet another alternative, if you have a feature or features you really want to implement that are not on this list, let us know, and we'll probably say yes!
+Normal debug mode colors the normals of the first impacted surface for each ray. Pure RGB colors are axis aligned.
+(i.e. Red pixels have normals along the x-axis)
+![Debug Mode](/screenshots/normal_debug.bmp "Normal Debug Mode")
 
 -------------------------------------------------------------------------------
-NOTES ON GLM:
+Global Lighting
 -------------------------------------------------------------------------------
-This project uses GLM, the GL Math library, for linear algebra. You need to know two important points on how GLM is used in this project:
+Early experiments with the pathtracer in closed environments showed that the final renderings could be quite dark.
+![Rendering](/renders/hallofmirrors5000.0.bmp "Dark Rendering")
 
-* In this project, indices in GLM vectors (such as vec3, vec4), are accessed via swizzling. So, instead of v[0], v.x is used, and instead of v[1], v.y is used, and so on and so forth.
-* GLM Matrix operations work fine on NVIDIA Fermi cards and later, but pre-Fermi cards do not play nice with GLM matrices. As such, in this project, GLM matrices are replaced with a custom matrix struct, called a cudaMat4, found in cudaMat4.h. A custom function for multiplying glm::vec4s and cudaMat4s is provided as multiplyMV() in intersections.h.
+My first attempt to correct this was increasing the lighting intensity, which caused saturation and speckling around light emitters.
+![Rendering](/renders/hallofmirrors.refractionbug.bmp "Saturation")
 
--------------------------------------------------------------------------------
-README
--------------------------------------------------------------------------------
-All students must replace or augment the contents of this Readme.md in a clear 
-manner with the following:
+What do you do when you don't have enough light? Add a sun!
+Here is a similar scene with a fresnel glass roof and direct overhead lighting. You can make out the blue tint of the sky through the ceiling but still see some lights reflected in it.
+
+![Rendering](/renders/greentintedglass.0.bmp "Skylight!")
+
+Not only did this dramatically improve the appearance and brightness of the scene, but because more rays are now hitting a "light source" the image converges much faster, especially in outdoor environments like the sundial images below.
+The implementation makes changing from a bright midday sun to a peaceful moonlit night as easy as changing the material id of the global light.
+The following scenes are all rendered with ONLY global lighting. Note how the shadow on the sundial tracks the light. All this works implicitly by moving the sun in the sky.
+Also note the fresnel reflections off the simulated water around the island.
+
+![Rendering](/renders/sundial1.bmp "Sundial") | ![Rendering](/renders/sundial1_moonlight.0.bmp "Sundial")
+![Rendering](/renders/sundial2.0.bmp "Sundial") | ![Rendering](/renders/sundial2_moonlight.0.bmp "Sundial")
+![Rendering](/renders/sundial3.0.bmp "Sundial") | ![Rendering](/renders/sundial3_moonlight.0.bmp "Sundial")
+
+Note how the global light in this scene behaves just as any other pathtraced emitter, creating interesting effects like caustics below the lenses.
 
-* A brief description of the project and the specific features you implemented.
-* At least one screenshot of your project running.
-* A 30 second or longer video of your project running.  To create the video you
-  can use http://www.microsoft.com/expression/products/Encoder4_Overview.aspx 
-* A performance evaluation (described in detail below).
+![Rendering](/renders/test.0.bmp "Caustics") 
 
 -------------------------------------------------------------------------------
 PERFORMANCE EVALUATION
 -------------------------------------------------------------------------------
-The performance evaluation is where you will investigate how to make your CUDA
-programs more efficient using the skills you've learned in class. You must have
-performed at least one experiment on your code to investigate the positive or
-negative effects on performance. 
+I ran a sweep of maximum trace depth and compared the render time of each frame with and without stream compaction.
+I also found it informative to plot the average number of bounces for each ray on the same plot. The test was run on both the hall of mirrors with glass ceiling and the sundial image listed above.
 
-One such experiment would be to investigate the performance increase involved 
-with adding a spatial data-structure to your scene data.
+The data contains an anomaly: after a trace depth of about 70-80, my random number generator started to fail, resulting in an increase in average bounces but less branch divergence, which affected the performance of both kernels.
+![Rendering](/testdata/island.png "Open Environment Sundial") 
+![Rendering](/testdata/HallOfMirrors.png "Closed Environment") 
 
-Another idea could be looking at the change in timing between various block
-sizes.
+Note that in both test cases, the average runtime of the stream compaction added a good deal of overhead that grew faster with the trace depth than the uncompacted ray pool.
+However, once the trace depth exceeds the average number of bounces, stream compaction immediately levels out while the no compaction runtime continues to increase.
 
-A good metric to track would be number of rays per second, or frames per 
-second, or number of objects displayable at 60fps.
+Depending on the environment, the crossover point varies considerably.
 
-We encourage you to get creative with your tweaks. Consider places in your code
-that could be considered bottlenecks and try to improve them. 
+Another interesting effect is for the Hall of Mirrors data, the kernel without stream compaction started running FASTER after a trace depth of 65.
+This is likely because at that point branch divergence is minimized because nearly every ray has been retired, so entire warps can simply retire themselves in 3 instructions.
 
-Each student should provide no more than a one page summary of their
-optimizations along with tables and or graphs to visually explain any
-performance differences.
 
 -------------------------------------------------------------------------------
-THIRD PARTY CODE POLICY
+NOTES ON GLM:
 -------------------------------------------------------------------------------
-* Use of any third-party code must be approved by asking on the Google group.  If it is approved, all students are welcome to use it.  Generally, we approve use of third-party code that is not a core part of the project.  For example, for the ray tracer, we would approve using a third-party library for loading models, but would not approve copying and pasting a CUDA function for doing refraction.
-* Third-party code must be credited in README.md.
-* Using third-party code without its approval, including using another student's code, is an academic integrity violation, and will result in you receiving an F for the semester.
+This project uses GLM, the GL Math library, for linear algebra. You need to know two important points on how GLM is used in this project:
 
--------------------------------------------------------------------------------
-SELF-GRADING
--------------------------------------------------------------------------------
-* On the submission date, email your grade, on a scale of 0 to 100, to Liam, liamboone+cis565@gmail.com, with a one paragraph explanation.  Be concise and realistic.  Recall that we reserve 30 points as a sanity check to adjust your grade.  Your actual grade will be (0.7 * your grade) + (0.3 * our grade).  We hope to only use this in extreme cases when your grade does not realistically reflect your work - it is either too high or too low.  In most cases, we plan to give you the exact grade you suggest.
-* Projects are not weighted evenly, e.g., Project 0 doesn't count as much as the path tracer.  We will determine the weighting at the end of the semester based on the size of each project.
+* In this project, indices in GLM vectors (such as vec3, vec4), are accessed via swizzling. So, instead of v[0], v.x is used, and instead of v[1], v.y is used, and so on and so forth.
+* GLM Matrix operations work fine on NVIDIA Fermi cards and later, but pre-Fermi cards do not play nice with GLM matrices. As such, in this project, GLM matrices are replaced with a custom matrix struct, called a cudaMat4, found in cudaMat4.h. A custom function for multiplying glm::vec4s and cudaMat4s is provided as multiplyMV() in intersections.h.
 
--------------------------------------------------------------------------------
-SUBMISSION
--------------------------------------------------------------------------------
-As with the previous project, you should fork this project and work inside of your fork. Upon completion, commit your finished project back to your fork, and make a pull request to the master repository.
-You should include a README.md file in the root directory detailing the following
 
-* A brief description of the project and specific features you implemented
-* At least one screenshot of your project running, and at least one screenshot of the final rendered output of your pathtracer
-* Instructions for building and running your project if they differ from the base code
-* A link to your blog post detailing the project
-* A list of all third-party code used
+-------------------------------------------------------------------------------
+THIRD PARTY CODE
+-------------------------------------------------------------------------------
+My implementation of parallel exclusive scan in CUDA was greatly influenced by this GPUGems article: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html 
+I had to rewrite it all myself and implement the arbitrary length array code, but the bulk of the code is very similar.
+
+
+
+-------------------------------------------------------------------------------
+TAKUAscene FORMAT:
+-------------------------------------------------------------------------------
+This project uses a custom scene description format, called TAKUAscene.
+TAKUAscene files are flat text files that describe all geometry, materials,
+lights, cameras, render settings, and animation frames inside of the scene.
+Items in the format are delimited by new lines, and comments can be added at
+the end of each line preceded with a double-slash.
+
+Materials are defined in the following fashion:
+
+* MATERIAL (material ID)								//material header
+* RGB (float r) (float g) (float b)					//diffuse color (light source color for global light)
+* SPECEX (float specex)									//specular exponent > 0. Higher values will result in more clear reflections/refractions. If set to -1, this will turn the material into a global light source
+* SPECRGB (float r) (float g) (float b)				//specular color (background/sky color for global light)
+* REFL (bool refl)									//reflectivity component. number between 0 and 1 (0==purely diffuse, 1==purely reflective)
+* REFR (bool refr)									//refractivity component. number between 0 and 1 (0==purely diffuse, 1==purely refractive)
+* REFRIOR (float ior)									//index of refraction
+  for Fresnel effects
+* SCATTER (float scatter)								//scatter flag, 0 for
+  no, 1 for yes
+* ABSCOEFF (float r) (float b) (float g)				//absorption
+  coefficients (direct light shading for global light source) 
+* RSCTCOEFF (float rsctcoeff)							//reduced scattering
+  coefficient
+* EMITTANCE (float emittance)							//the emittance of the
+  material. Anything >0 makes the material a light source. Going above 1 will make image saturate some pixels
+
+Cameras are defined in the following fashion:
+
+* CAMERA 												//camera header
+* RES (float x) (float y)								//resolution
+* FOVY (float fovy)										//vertical field of
+  view half-angle. the horizontal angle is calculated from this and the
+  resolution
+* ITERATIONS (float iterations)							//how many
+  iterations to refine the image, only relevant for supersampled antialiasing,
+  depth of field, area lights, and other distributed raytracing applications
+* FILE (string filename)									//file to output
+  render to upon completion
+* frame (frame number)									//start of a frame
+* EYE (float x) (float y) (float z)						//camera's position in
+  worldspace
+* VIEW (float x) (float y) (float z)						//camera's view
+  direction
+* UP (float x) (float y) (float z)						//camera's up vector
+
+Objects are defined in the following fashion:
+* OBJECT (object ID)										//object header
+* (cube OR sphere OR mesh)								//type of object, can
+  be either "cube", "sphere", or "mesh". Note that cubes and spheres are unit
+  sized and centered at the origin.
+* material (material ID)									//material to
+  assign this object
+* frame (frame number)									//start of a frame
+* TRANS (float transx) (float transy) (float transz)		//translation
+* ROTAT (float rotationx) (float rotationy) (float rotationz)		//rotation
+* SCALE (float scalex) (float scaley) (float scalez)		//scale
+
+An example TAKUAscene file setting up two frames inside of a Cornell Box can be
+found in the scenes/ directory.
diff --git a/renders/fuzzyglass.bmp b/renders/fuzzyglass.bmp
new file mode 100644
index 0000000..ebe27df
Binary files /dev/null and b/renders/fuzzyglass.bmp differ
diff --git a/renders/fuzzyglass_fresnel.bmp b/renders/fuzzyglass_fresnel.bmp
new file mode 100644
index 0000000..472e896
Binary files /dev/null and b/renders/fuzzyglass_fresnel.bmp differ
diff --git a/renders/glassceiling.bmp b/renders/glassceiling.bmp
new file mode 100644
index 0000000..5924de9
Binary files /dev/null and b/renders/glassceiling.bmp differ
diff --git a/renders/greentintedglass.0.bmp b/renders/greentintedglass.0.bmp
new file mode 100644
index 0000000..e18fb58
Binary files /dev/null and b/renders/greentintedglass.0.bmp differ
diff --git a/renders/hallofmirrors.0.bmp b/renders/hallofmirrors.0.bmp
new file mode 100644
index 0000000..5bf59f5
Binary files /dev/null and b/renders/hallofmirrors.0.bmp differ
diff --git a/renders/hallofmirrors.badrefraction.bmp b/renders/hallofmirrors.badrefraction.bmp
new file mode 100644
index 0000000..cb85a82
Binary files /dev/null and b/renders/hallofmirrors.badrefraction.bmp differ
diff --git a/renders/hallofmirrors.bmp b/renders/hallofmirrors.bmp
new file mode 100644
index 0000000..7e860d8
Binary files /dev/null and b/renders/hallofmirrors.bmp differ
diff --git a/renders/hallofmirrors.refractionbug.bmp b/renders/hallofmirrors.refractionbug.bmp
new file mode 100644
index 0000000..8d2c38a
Binary files /dev/null and b/renders/hallofmirrors.refractionbug.bmp differ
diff --git a/renders/hallofmirrors5000.0.bmp b/renders/hallofmirrors5000.0.bmp
new file mode 100644
index 0000000..6d9ce2d
Binary files /dev/null and b/renders/hallofmirrors5000.0.bmp differ
diff --git a/renders/stonehenge.0.bmp b/renders/stonehenge.0.bmp
new file mode 100644
index 0000000..446c907
Binary files /dev/null and b/renders/stonehenge.0.bmp differ
diff --git a/renders/sundial1.bmp b/renders/sundial1.bmp
new file mode 100644
index 0000000..562ad7b
Binary files /dev/null and b/renders/sundial1.bmp differ
diff --git a/renders/sundial1_moonlight.0.bmp b/renders/sundial1_moonlight.0.bmp
new file mode 100644
index 0000000..08eb7f7
Binary files /dev/null and b/renders/sundial1_moonlight.0.bmp differ
diff --git a/renders/sundial2.0.bmp b/renders/sundial2.0.bmp
new file mode 100644
index 0000000..fabccb4
Binary files /dev/null and b/renders/sundial2.0.bmp differ
diff --git a/renders/sundial2_moonlight.0.bmp b/renders/sundial2_moonlight.0.bmp
new file mode 100644
index 0000000..d373022
Binary files /dev/null and b/renders/sundial2_moonlight.0.bmp differ
diff --git a/renders/sundial3.0.bmp b/renders/sundial3.0.bmp
new file mode 100644
index 0000000..24feff1
Binary files /dev/null and b/renders/sundial3.0.bmp differ
diff --git a/renders/sundial3_moonlight.0.bmp b/renders/sundial3_moonlight.0.bmp
new file mode 100644
index 0000000..6585b47
Binary files /dev/null and b/renders/sundial3_moonlight.0.bmp differ
diff --git a/renders/test.0.bmp b/renders/test.0.bmp
index 9319138..c3d7028 100644
Binary files a/renders/test.0.bmp and b/renders/test.0.bmp differ
diff --git a/scenes/cube.obj b/scenes/cube.obj
new file mode 100644
index 0000000..f7cd3af
--- /dev/null
+++ b/scenes/cube.obj
@@ -0,0 +1,28 @@
+v -0.5 -0.5 -0.5
+v -0.5 -0.5 0.5
+v -0.5 0.5 0.5
+v -0.5 0.5 -0.5
+v 0.5 0.5 -0.5
+v 0.5 0.5 0.5
+v 0.5 -0.5 0.5
+v 0.5 -0.5 -0.5
+
+vn 1 0 0
+vn 0 1 0
+vn 0 0 1
+vn -1 0 0
+vn 0 -1 0
+vn 0 0 -1
+
+f 1//4 2//4 4//4
+f 4//4 2//4 3//4
+f 8//6 1//6 5//6
+f 5//6 1//6 4//6
+f 4//2 3//2 5//2
+f 5//2 3//2 6//2
+f 5//1 6//1 8//1
+f 8//1 6//1 7//1
+f 2//3 7//3 3//3
+f 3//3 7//3 6//3
+f 8//5 7//5 1//5
+f 1//5 7//5 2//5
\ No newline at end of file
diff --git a/scenes/hallofmirrors.txt b/scenes/hallofmirrors.txt
new file mode 100644
index 0000000..a7c823a
--- /dev/null
+++ b/scenes/hallofmirrors.txt
@@ -0,0 +1,290 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 1 				//red diffuse
+RGB         .63 .06 .04       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2 				//green diffuse
+RGB         .15 .48 .09      
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3 				//red glossy
+RGB         .63 .06 .04      
+SPECEX      5      
+SPECRGB     1 1 1       
+REFL        0       
+REFR        0        
+REFRIOR     2       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 4 				//white glossy
+RGB         1 1 1     
+SPECEX      10      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     2      
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 5 				//glass
+RGB         1 1 1    
+SPECEX      1000000000000000      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        1        
+REFRIOR     1.5
+SCATTER     0        
+ABSCOEFF    .02 0 0.02  
+RSCTCOEFF   13
+EMITTANCE   0
+
+MATERIAL 6 				//green glossy
+RGB         .15 .48 .09      
+SPECEX      10      
+SPECRGB     1 1 1     
+REFL        0       
+REFR        0        
+REFRIOR     2.6       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 7				//light
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   1
+
+MATERIAL 8				//light
+RGB         0 1 0
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   1
+
+
+MATERIAL 9				//light
+RGB         1 0 0
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   1
+
+MATERIAL 10 				//bronze mirror
+RGB         0.8 0.5 0.195
+SPECEX      1000      
+SPECRGB     0.8 0.5 0.195     
+REFL        0.80       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 11 				//Pure mirror
+RGB         1 1 1
+SPECEX      1000000      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+
+MATERIAL 12 				//blueish mirror
+RGB         0.75 0.75 1
+SPECEX      1000000    
+SPECRGB     1 1 1      
+REFL        1       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 13 			//glass
+RGB         1 1 1    
+SPECEX      1e4     
+SPECRGB     1 1 1      
+REFL        1      
+REFR        1        
+REFRIOR     1.5
+SCATTER     0        
+ABSCOEFF    .0 0.02 0.02  
+RSCTCOEFF   13
+EMITTANCE   0
+
+MATERIAL 	14 					//Daylight
+RGB         1 0.77 0.075       	//Emitter Color
+SPECEX      -1					//Flag to use as global light material				
+SPECRGB     0.66 0.81 0.94      //Direct lighting background color (daylight sky blue)
+REFL        0 //Must be zero
+REFR        0 //Must be zero       
+REFRIOR     0       
+SCATTER     0        	
+ABSCOEFF    1 1 1		//Diffuse shading light color (Full white light)
+RSCTCOEFF   0
+EMITTANCE   1
+
+CAMERA
+RES         800 800
+FOVY        25
+ITERATIONS  600
+FILE        greentintedglass.bmp
+frame 0
+EYE         0 4 4.8
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT 0
+cube
+material 0
+frame 0
+TRANS       0 0 0
+ROTAT       0 0 90
+SCALE       .01 10 10 
+
+OBJECT 1
+cube
+material 12
+frame 0
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10 
+
+OBJECT 2
+cube
+material 5
+frame 0
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+OBJECT 3
+cube
+material 1
+frame 0
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+OBJECT 4
+cube
+material 2
+frame 0
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+OBJECT 5
+sphere
+material 10
+frame 0
+TRANS       0 2 -1
+ROTAT       0 180 0
+SCALE       3 3 3
+
+OBJECT 6
+sphere
+material 11
+frame 0
+TRANS       2 5 1
+ROTAT       0 180 0
+SCALE       2.5 2.5 2.5
+
+OBJECT 7
+cube
+material 5
+frame 0
+TRANS       -2 5 -3
+ROTAT       0 0 0
+SCALE       2.5 2.5 2.5
+
+
+OBJECT 8
+sphere
+material 7 
+frame 0
+TRANS       0 9.5 2
+ROTAT       0 0 90
+SCALE       0.5 2 2
+
+OBJECT 9
+sphere
+material 9
+frame 0
+TRANS       -4 1.5 3
+ROTAT       0 0 90
+SCALE       1 1 1
+
+
+OBJECT 10
+cube
+material 11
+frame 0
+TRANS       0 5 5
+ROTAT       0 90 0
+SCALE       .01 10 10 
+
+//Global light
+OBJECT 11
+sphere
+material 14
+frame 0
+TRANS       0 5e4 0
+ROTAT       0 0 0
+SCALE       1e4 1e4 1e4
diff --git a/scenes/refractedmirrors.txt b/scenes/refractedmirrors.txt
new file mode 100644
index 0000000..e0a0b67
--- /dev/null
+++ b/scenes/refractedmirrors.txt
@@ -0,0 +1,269 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 1 				//red diffuse
+RGB         .63 .06 .04       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2 				//green diffuse
+RGB         .15 .48 .09      
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3 				//red glossy
+RGB         .63 .06 .04      
+SPECEX      5      
+SPECRGB     1 1 1       
+REFL        0       
+REFR        0        
+REFRIOR     2       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 4 				//white glossy
+RGB         1 1 1     
+SPECEX      10      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     2      
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 5 				//glass
+RGB         1 1 1    
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        1        
+REFRIOR     1.5      
+SCATTER     0        
+ABSCOEFF    .02 5.1 5.7      
+RSCTCOEFF   13
+EMITTANCE   0
+
+MATERIAL 6 				//green glossy
+RGB         .15 .48 .09      
+SPECEX      10      
+SPECRGB     1 1 1     
+REFL        0       
+REFR        0        
+REFRIOR     2.6       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 7				//light
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   1
+
+MATERIAL 8				//light
+RGB         0 1 0
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0.65
+
+
+MATERIAL 9				//light
+RGB         1 0 0
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0.75
+
+MATERIAL 10 				//bronze mirror
+RGB         0.8 0.5 0.195
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 11 				//Pure mirror
+RGB         1 1 1
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+
+MATERIAL 12 				//blueish mirror
+RGB         0.75 0.75 1
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0
+RSCTCOEFF   0
+EMITTANCE   0
+
+
+
+CAMERA
+RES         800 800
+FOVY        25
+ITERATIONS  5000
+FILE        refractionwithmirrors.bmp
+frame 0
+EYE         0 4.5 4.9
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT 0
+cube
+material 0
+frame 0
+TRANS       0 0 0
+ROTAT       0 0 90
+SCALE       .01 10 10 
+
+OBJECT 1
+cube
+material 12
+frame 0
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10 
+
+OBJECT 2
+cube
+material 0
+frame 0
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+OBJECT 3
+cube
+material 1
+frame 0
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+OBJECT 4
+cube
+material 2
+frame 0
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+OBJECT 5
+sphere
+material 10
+frame 0
+TRANS       0 2 -1
+ROTAT       0 180 0
+SCALE       3 3 3
+
+OBJECT 6
+sphere
+material 11
+frame 0
+TRANS       2 5 1
+ROTAT       0 180 0
+SCALE       2.5 2.5 2.5
+
+OBJECT 7
+sphere
+material 6
+frame 0
+TRANS       -2 5 -3
+ROTAT       0 180 0
+SCALE       3 3 3
+
+
+OBJECT 8
+sphere
+material 7 
+frame 0
+TRANS       0 9.5 2
+ROTAT       0 0 90
+SCALE       0.5 2 2
+
+OBJECT 9
+sphere
+material 9
+frame 0
+TRANS       -4 1.5 3
+ROTAT       0 0 90
+SCALE       1 1 1
+
+
+OBJECT 10
+cube
+material 12
+frame 0
+TRANS       0 5 5
+ROTAT       0 90 0
+SCALE       .01 10 10 
+
+
+OBJECT 11
+sphere
+material 5
+frame 0
+TRANS       0 5 -1
+ROTAT       0 80 10
+SCALE       1.5 1.5 1.5
+
diff --git a/scenes/sampleScene.txt b/scenes/sampleScene.txt
index 52d079e..4282933 100755
--- a/scenes/sampleScene.txt
+++ b/scenes/sampleScene.txt
@@ -50,7 +50,7 @@ MATERIAL 4 				//white glossy
 RGB         1 1 1     
 SPECEX      0      
 SPECRGB     1 1 1      
-REFL        0       
+REFL        1       
 REFR        0        
 REFRIOR     2      
 SCATTER     0        
@@ -58,15 +58,15 @@ ABSCOEFF    0 0 0
 RSCTCOEFF   0
 EMITTANCE   0
 
-MATERIAL 5 				//glass
+MATERIAL 5 				//red tinted glass
 RGB         0 0 0    
 SPECEX      0      
 SPECRGB     1 1 1      
 REFL        0       
-REFR        1        
-REFRIOR     2.2       
+REFR        1     
+REFRIOR     1.5      
 SCATTER     0        
-ABSCOEFF    .02 5.1 5.7      
+ABSCOEFF    0 0 0    
 RSCTCOEFF   13
 EMITTANCE   0
 
@@ -104,19 +104,15 @@ REFRIOR     0
 SCATTER     0        
 ABSCOEFF    0 0 0      
 RSCTCOEFF   0
-EMITTANCE   15
+EMITTANCE   0
 
 CAMERA
-RES         800 800
+RES         600 600
 FOVY        25
 ITERATIONS  5000
 FILE        test.bmp
 frame 0
-EYE         0 4.5 12
-VIEW        0 0 -1
-UP          0 1 0
-frame 1
-EYE         0 4.5 12
+EYE         0 4.5 20
 VIEW        0 0 -1
 UP          0 1 0
 
@@ -127,10 +123,6 @@ frame 0
 TRANS       0 0 0
 ROTAT       0 0 90
 SCALE       .01 10 10 
-frame 1
-TRANS       0 0 0
-ROTAT       0 0 90
-SCALE       .01 10 10 
 
 OBJECT 1
 cube
@@ -139,10 +131,6 @@ frame 0
 TRANS       0 5 -5
 ROTAT       0 90 0
 SCALE       .01 10 10 
-frame 1
-TRANS       0 5 -5
-ROTAT       0 90 0
-SCALE       .01 10 10 
 
 OBJECT 2
 cube
@@ -150,11 +138,7 @@ material 0
 frame 0
 TRANS       0 10 0
 ROTAT       0 0 90
-SCALE       .01 10 10
-frame 1
-TRANS       0 10 0
-ROTAT       0 0 90
-SCALE       .01 10 10
+SCALE       .01 0.1 0.1
 
 OBJECT 3
 cube
@@ -163,10 +147,6 @@ frame 0
 TRANS       -5 5 0
 ROTAT       0 0 0
 SCALE       .01 10 10
-frame 1
-TRANS       -5 5 0
-ROTAT       0 0 0
-SCALE       .01 10 10
 
 OBJECT 4
 cube
@@ -175,45 +155,29 @@ frame 0
 TRANS       5 5 0
 ROTAT       0 0 0
 SCALE       .01 10 10
-frame 1
-TRANS       5 5 0
-ROTAT       0 0 0
-SCALE       .01 10 10
 
 OBJECT 5
 sphere
-material 4
+material 5
 frame 0
 TRANS       0 2 0
 ROTAT       0 180 0
 SCALE       3 3 3
-frame 1
-TRANS       0 2 0
-ROTAT       0 180 0
-SCALE       3 3 3
 
 OBJECT 6
 sphere
-material 3
+material 5
 frame 0
 TRANS       2 5 2
 ROTAT       0 180 0
 SCALE       2.5 2.5 2.5
-frame 1
-TRANS       2 5 2
-ROTAT       0 180 0
-SCALE       2.5 2.5 2.5
 
 OBJECT 7
 sphere
-material 6
+material 5
 frame 0
-TRANS       -2 5 -2
-ROTAT       0 180 0
-SCALE       3 3 3
-frame 1
-TRANS       -2 5 -2
-ROTAT       0 180 0
+TRANS       -2 5 0
+ROTAT       0 0 0
 SCALE       3 3 3
 
 OBJECT 8
@@ -223,7 +187,3 @@ frame 0
 TRANS       0 10 0
 ROTAT       0 0 90
 SCALE       .3 3 3
-frame 1
-TRANS       0 10 0
-ROTAT       0 0 90
-SCALE       .3 3 3
diff --git a/scenes/sampleScene2.txt b/scenes/sampleScene2.txt
new file mode 100644
index 0000000..43e7e7c
--- /dev/null
+++ b/scenes/sampleScene2.txt
@@ -0,0 +1,212 @@
+MATERIAL 0				//white diffuse
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 1 				//red diffuse
+RGB         .63 .06 .04       
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2 				//green diffuse
+RGB         .15 .48 .09      
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3 				//red glossy
+RGB         .63 .06 .04      
+SPECEX      5      
+SPECRGB     1 1 1       
+REFL        0       
+REFR        0        
+REFRIOR     2       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 4 				//white glossy
+RGB         1 1 1     
+SPECEX      10      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        0        
+REFRIOR     2      
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 5 				//glass
+RGB         0 0 0    
+SPECEX      0      
+SPECRGB     1 1 1      
+REFL        0       
+REFR        1        
+REFRIOR     2.2       
+SCATTER     0        
+ABSCOEFF    .02 5.1 5.7      
+RSCTCOEFF   13
+EMITTANCE   0
+
+MATERIAL 6 				//green glossy
+RGB         .15 .48 .09      
+SPECEX      10      
+SPECRGB     1 1 1     
+REFL        0       
+REFR        0        
+REFRIOR     2.6       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 7				//light
+RGB         1 1 1       
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   1
+
+MATERIAL 8				//light
+RGB         1 0 0       
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0.65
+
+
+MATERIAL 9				//light
+RGB         0 0 1
+SPECEX      0      
+SPECRGB     0 0 0       
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0.65
+
+
+CAMERA
+RES         800 800
+FOVY        25
+ITERATIONS  5000
+FILE        multisource.bmp
+frame 0
+EYE         0 4.5 12
+VIEW        0 0 -1
+UP          0 1 0
+
+OBJECT 0
+cube
+material 0
+frame 0
+TRANS       0 0 0
+ROTAT       0 0 90
+SCALE       .01 10 10 
+
+OBJECT 1
+cube
+material 0
+frame 0
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10 
+
+OBJECT 2
+cube
+material 0
+frame 0
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+OBJECT 3
+cube
+material 1
+frame 0
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+OBJECT 4
+cube
+material 2
+frame 0
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+OBJECT 5
+sphere
+material 4
+frame 0
+TRANS       0 2 0
+ROTAT       0 180 0
+SCALE       3 3 3
+
+OBJECT 6
+sphere
+material 3
+frame 0
+TRANS       2 5 2
+ROTAT       0 180 0
+SCALE       2.5 2.5 2.5
+
+OBJECT 7
+sphere
+material 6
+frame 0
+TRANS       -2 5 -2
+ROTAT       0 180 0
+SCALE       3 3 3
+
+
+OBJECT 8
+sphere
+material 7 
+frame 0
+TRANS       -3 7 4
+ROTAT       0 0 90
+SCALE       1 1 1
+
+//OBJECT 9
+//sphere
+//material 8
+//frame 0
+//TRANS       3 1 -1
+//ROTAT       0 0 90
+//SCALE       1 1 1
diff --git a/scenes/stonehenge_boring.txt b/scenes/stonehenge_boring.txt
new file mode 100644
index 0000000..acbfba9
--- /dev/null
+++ b/scenes/stonehenge_boring.txt
@@ -0,0 +1,162 @@
+//Define global lighting conditions
+MATERIAL 	0 					//Daylight
+RGB         1 0.77 0.075       	//Emitter Color
+SPECEX      -1					//Flag to use as global light material				
+SPECRGB     0.66 0.81 0.94      //Direct lighting background color (daylight sky blue)
+REFL        0 //Must be zero
+REFR        0 //Must be zero       
+REFRIOR     0       
+SCATTER     0        	
+ABSCOEFF    1 1 1		//Diffuse shading light color (Full white light)
+RSCTCOEFF   0
+EMITTANCE   1
+
+MATERIAL 1				//stone
+RGB         0.3 0.3 0.2       
+SPECEX      0      
+SPECRGB     0 0 0      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 2 				//grass
+RGB         .15 .5 .1       
+SPECEX      0      
+SPECRGB     0 0 0      
+REFL        0       
+REFR        0        
+REFRIOR     0       
+SCATTER     0        
+ABSCOEFF    0 0 0      
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 3 				//Deep Blue Water
+RGB         0 0 1      
+SPECEX      1e15      
+SPECRGB     1 1 1      
+REFL        0.9       
+REFR        0.9        
+REFRIOR     1.3       
+SCATTER     0        
+ABSCOEFF    1 1 0.5
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 4 				//Clear Water
+RGB         0 0 1      
+SPECEX      1e15      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        1        
+REFRIOR     1.3       
+SCATTER     0        
+ABSCOEFF    0.3 0.3 0.1     
+RSCTCOEFF   0
+EMITTANCE   0
+
+MATERIAL 5 				//Cloudy Water
+RGB         0 0 1      
+SPECEX      1e5      
+SPECRGB     1 1 1      
+REFL        1       
+REFR        1        
+REFRIOR     1.3       
+SCATTER     0        
+ABSCOEFF    0.05 0.05 0.02      
+RSCTCOEFF   0
+EMITTANCE   0
+
+//Define global lighting conditions
+MATERIAL 	6 					//Moonlight
+RGB         1 1 1       	//Emitter Color
+SPECEX      -1					//Flag to use as global light material				
+SPECRGB     0.1 0.1 0.2       //Direct lighting background color (dark night-sky blue)
+REFL        0 //Must be zero
+REFR        0 //Must be zero       
+REFRIOR     0       
+SCATTER     0        	
+ABSCOEFF    0.8 0.8 1		//Diffuse shading light color (slightly blue-tinted white light)
+RSCTCOEFF   0
+EMITTANCE   0.8
+
+CAMERA
+RES         600 600
+FOVY        25
+ITERATIONS  500
+FILE        sundial3_moonlight.bmp
+frame 0
+EYE         0 25 75
+VIEW        0 0 -1
+UP          0 1 0
+
+//Global light
+OBJECT 0
+sphere
+material 6
+frame 0
+TRANS      -1e4 5e4 -3e4
+ROTAT       0 0 0
+SCALE       1e4 1e4 1e4
+
+//Bedrock
+OBJECT 1
+cube
+material 1
+frame 0
+TRANS       0 0 -0.05 
+ROTAT       0 0 0
+SCALE       1e5 0.1 1e5
+
+//Island
+OBJECT 2
+sphere
+material 2
+frame 0
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       60 7.5 60
+
+//Water
+OBJECT 3
+cube
+material 3
+frame 0
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       1e5 5 1e5
+
+//Center Altar top
+OBJECT 4
+cube
+material 1
+frame 0
+TRANS       0 7 0
+ROTAT       0 0 0
+SCALE       15 1 15
+
+
+//Center Altar bottom
+OBJECT 5
+cube
+material 1
+frame 0
+TRANS       0 5 0
+ROTAT       0 0 0
+SCALE       9.5 2 9.5
+
+
+//Pier Pilar
+OBJECT 6
+cube
+material 1
+frame 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       1 10 1
+
+
diff --git a/screenshots/normal_debug.bmp b/screenshots/normal_debug.bmp
new file mode 100644
index 0000000..518de32
Binary files /dev/null and b/screenshots/normal_debug.bmp differ
diff --git a/screenshots/tracedepth_debug.bmp b/screenshots/tracedepth_debug.bmp
new file mode 100644
index 0000000..6fe124e
Binary files /dev/null and b/screenshots/tracedepth_debug.bmp differ
diff --git a/src/cudaAlgorithms.cu b/src/cudaAlgorithms.cu
new file mode 100644
index 0000000..d3d2f2b
--- /dev/null
+++ b/src/cudaAlgorithms.cu
@@ -0,0 +1,466 @@
+#include "cudaAlgorithms.h"
+
+template<typename DataType, typename FlagOperation>
+__global__ void	flagCheck(DataType* data, int* flagArray, int N, FlagOperation op)
+{
+	//One thread per element: write 1 where op(element) holds, 0 otherwise.
+	//The grid may be 2D, so flatten the block coordinates first.
+	const int flatBlock = blockIdx.y*gridDim.x + blockIdx.x;
+	const int i         = flatBlock*blockDim.x + threadIdx.x;
+	if(i >= N)
+		return;	//Surplus thread in the last block
+
+	flagArray[i] = op(data[i]) ? 1 : 0;
+}
+
+template<typename DataType>
+__global__ void scatter(DataType* streamIn, DataType* streamOut, int* indecies, int* flagArray, int N)
+{
+	//One thread per input element; a flagged element is copied to the
+	//output slot that indecies[] holds for it.
+	const int flatBlock = blockIdx.y*gridDim.x + blockIdx.x;
+	const int src       = flatBlock*blockDim.x + threadIdx.x;
+
+	const bool keep = (src < N) && flagArray[src];
+	if(keep)
+		streamOut[indecies[src]] = streamIn[src];
+}
+//Keeps the elements of streamIn (N device elements) for which op(element) is true,
+//preserving their order, and returns how many were kept.
+//Warning: *streamOut is allocated on the GPU here; the caller must cudaFree it.
+template<typename DataType, typename FlagOperation>
+__host__ int streamCompaction(DataType* streamIn, DataType** streamOut, int N, FlagOperation op)
+{
+	//Empty input: a zero-block launch is invalid, so return an empty stream
+	//(null pointer, count 0) without touching the GPU.
+	if(N <= 0){
+		*streamOut = 0;
+		return 0;
+	}
+	int blockSize = MAX_BLOCK_DIM_X;
+	dim3 threadsPerBlock(blockSize);
+	dim3 fullBlocksPerGrid;
+	//Integer ceiling division; float(N) is inexact above 2^24 elements
+	int numBlocks = (N + blockSize - 1)/blockSize;
+	if(numBlocks > MAX_GRID_DIM_X){
+		fullBlocksPerGrid = dim3(MAX_GRID_DIM_X, (int)ceil( numBlocks / float(MAX_GRID_DIM_X)));
+	}else{
+		fullBlocksPerGrid = dim3(numBlocks);
+	}
+
+	//Per-element keep flags, and their exclusive prefix sum (the output slots)
+	int* flagArray;
+	cudaMalloc((void**)&flagArray, N*sizeof(int));
+	int* indecies;
+	cudaMalloc((void**)&indecies, N*sizeof(int));
+	flagCheck<<<fullBlocksPerGrid, threadsPerBlock>>>(streamIn, flagArray, N, op);
+	int newN = exclusive_scan_sum(flagArray, indecies, N);	//Sum of the flags
+
+	//Allocate new array and scatter the survivors into it
+	cudaMalloc((void**)streamOut, newN*sizeof(DataType));
+	if(newN > 0)
+		scatter<<<fullBlocksPerGrid, threadsPerBlock>>>(streamIn, *streamOut, indecies, flagArray, N);
+	cudaFree(indecies);
+	cudaFree(flagArray);
+	return newN;
+}
+
+
+template<typename DataType, typename BinaryOperation>
+__global__ void inclusive_scan_kernel(DataType* datain, DataType* dataout, DataType* blockResults, int N, BinaryOperation op)
+{
+	//Each block scans one tile of 2*blockDim.x elements (two per thread).
+	int blockIndex = blockIdx.x + blockIdx.y*gridDim.x;
+	int blockOffset = blockIndex * (blockDim.x*2);
+
+	//A 2D grid is rounded up, so trailing blocks can start at or past N. Such a
+	//block would touch memory beyond datain/dataout/blockResults. The whole
+	//block leaves together, so the barriers below stay uniform.
+	if(blockOffset >= N)
+		return;
+
+	int fullElements = N - blockOffset;
+	if(fullElements > (blockDim.x*2))
+		fullElements = blockDim.x*2;
+	DataType blockResult = inclusive_scan_block(&datain[blockOffset], &dataout[blockOffset], fullElements, op);
+	__syncthreads();	//Wait for results to come in
+
+	if(threadIdx.x == 0){
+		//Only one thread writes back the tile's last element and the block result
+		dataout[blockOffset + fullElements - 1] = blockResult;
+		blockResults[blockIndex] = blockResult;
+	}
+}
+
+template<typename DataType, typename BinaryOperation>
+__global__ void exclusive_scan_kernel(DataType* datain, DataType* dataout, DataType* blockResults, int N, BinaryOperation op)
+{
+	//Each block scans one tile of 2*blockDim.x elements (two per thread).
+	int blockIndex = blockIdx.x + blockIdx.y*gridDim.x;
+	int blockOffset = blockIndex * (blockDim.x*2);
+
+	//A 2D grid is rounded up, so trailing blocks can start at or past N. Such a
+	//block would touch memory beyond datain/dataout/blockResults. The whole
+	//block leaves together, so the barriers below stay uniform.
+	if(blockOffset >= N)
+		return;
+
+	int fullElements = N - blockOffset;
+	if(fullElements > (blockDim.x*2))
+		fullElements = blockDim.x*2;
+	DataType blockResult = exclusive_scan_block(&datain[blockOffset], &dataout[blockOffset], fullElements, op);
+	__syncthreads();	//Wait for results to come in
+
+	//Only one thread publishes the tile total for the next scan level
+	if(threadIdx.x == 0)
+		blockResults[blockIndex] = blockResult;
+}
+
+template<typename DataType, typename BinaryOperation>
+__global__ 	void scan_reintegrate_blocks(DataType* dataout, DataType* blockResults, int N, BinaryOperation op)
+{
+	//Combines blockResults[block] into every element of that block's tile; block 0 is skipped.
+	const int tile = blockIdx.x + blockIdx.y*gridDim.x;
+	if(tile <= 0)
+		return;
+
+	//Two elements per thread: one in each half of the tile
+	const int tileSize = blockDim.x*2;
+	const int lo = tile*tileSize + threadIdx.x;
+	const int hi = lo + tileSize/2;
+
+	if(lo < N)
+		dataout[lo] = op(blockResults[tile], dataout[lo]);
+	if(hi < N)
+		dataout[hi] = op(blockResults[tile], dataout[hi]);
+}
+
+//Generic exclusive scan of N device elements; datain and dataout may be the same buffer.
+//Returns the combination of all N elements, or DataType() when N <= 0.
+template<typename DataType, typename BinaryOperation>
+__host__ DataType exclusive_scan(DataType* datain, DataType* dataout, int N, BinaryOperation op)
+{
+	//Nothing to scan; N == 0 would launch zero blocks and then recurse forever.
+	if(N <= 0)
+		return DataType();
+
+	//Divide array into blocks (TODO: get the block size dynamically)
+	int blockSize = MAX_BLOCK_DIM_X;
+	dim3 threadsPerBlock(blockSize);
+	dim3 fullBlocksPerGrid;
+	//2 data elements per thread; integer ceiling, since float(N) is inexact above 2^24
+	int numBlocks = (N + 2*blockSize - 1)/(2*blockSize);
+	if(numBlocks > MAX_GRID_DIM_X){
+		fullBlocksPerGrid = dim3(MAX_GRID_DIM_X, (int)ceil( numBlocks / float(MAX_GRID_DIM_X)));
+	}else{
+		fullBlocksPerGrid = dim3(numBlocks);
+	}
+
+	//Create an array to store results from each block
+	DataType* blockResults;
+	cudaMalloc((void**)&blockResults, numBlocks*sizeof(DataType));
+	exclusive_scan_kernel<<<fullBlocksPerGrid, threadsPerBlock, (2*blockSize+2)*sizeof(DataType)>>>(datain, dataout, blockResults, N, op);
+	DataType result;
+	if(numBlocks == 1){
+		//We've reached the bottom of the stack, grab the answer. Just one element
+		cudaMemcpy( &result, blockResults, sizeof(DataType), cudaMemcpyDeviceToHost);
+	}else{
+		//Scan the per-block totals in place, then fold them back into each block's elements
+		result = exclusive_scan(blockResults, blockResults, numBlocks, op);
+		scan_reintegrate_blocks<<<fullBlocksPerGrid, threadsPerBlock>>>(dataout, blockResults, N, op);
+	}
+	cudaFree(blockResults);
+	return result;
+}
+
+
+
+template<typename DataType>
+__global__ void copy_array_kernel(DataType* datain, DataType* dataout, int N)
+{
+	//One thread per element; surplus threads of a rounded-up grid (index >= N) do nothing.
+	int blockIndex = blockIdx.x + blockIdx.y*gridDim.x;
+	int index = blockIndex*blockDim.x + threadIdx.x;
+	if(index < N)
+		dataout[index] = datain[index];
+}
+
+template<typename DataType>
+__global__ void exclusive_to_inclusive_kernel(DataType* datain, DataType* dataout, DataType result, int N)
+{
+	//Shifts the input left by one: dataout[i] = datain[i+1], and the last
+	//slot receives `result`.
+	//datain and dataout must be different buffers, since other threads may
+	//still be reading datain[i+1] while this one writes dataout[i].
+	int blockIndex = blockIdx.x + blockIdx.y*gridDim.x;
+	int indexOut = blockIndex*blockDim.x + threadIdx.x;
+	if(indexOut >= N)
+		return;	//Surplus thread: must not touch either buffer
+
+	//Last element of array takes result; everything else takes its right neighbour
+	dataout[indexOut] = (indexOut < N - 1) ? datain[indexOut+1] : result;
+}
+
+
+//Shift exclusive to inclusive results in place; result becomes the last element of data.
+template<typename DataType>
+__host__ void exclusive_to_inclusive(DataType* data, int N, DataType result)
+{
+	if(N <= 0)
+		return;	//Nothing to shift; a zero-block launch would be invalid
+	int blockSize = MAX_BLOCK_DIM_X;
+	dim3 threadsPerBlock(blockSize);
+	dim3 fullBlocksPerGrid;
+	int numBlocks = (N + blockSize - 1)/blockSize;//1 data element per thread, integer ceiling
+	if(numBlocks > MAX_GRID_DIM_X){
+		fullBlocksPerGrid = dim3(MAX_GRID_DIM_X, (int)ceil( numBlocks / float(MAX_GRID_DIM_X)));
+	}else{
+		fullBlocksPerGrid = dim3(numBlocks);
+	}
+
+	//The shift reads each element's right neighbour, so it goes through a scratch buffer.
+	DataType* cudatemp;
+	cudaMalloc((void**)&cudatemp, N*sizeof(DataType));
+	exclusive_to_inclusive_kernel<<<fullBlocksPerGrid, threadsPerBlock>>>(data, cudatemp, result, N);
+	cudaMemcpy(data, cudatemp, N*sizeof(DataType), cudaMemcpyDeviceToDevice);	//Device-to-device copy of exactly N elements
+	cudaFree(cudatemp);
+}
+
+//Generic inclusive scan of N device elements; returns their overall combination (DataType() if N <= 0).
+template<typename DataType, typename BinaryOperation>
+__host__ DataType inclusive_scan(DataType* datain, DataType* dataout, int N, BinaryOperation op)
+{
+	//Nothing to scan; N == 0 would launch zero blocks and then recurse forever.
+	if(N <= 0)
+		return DataType();
+
+	//Divide array into blocks (TODO: get the block size dynamically)
+	int blockSize = MAX_BLOCK_DIM_X;
+	dim3 threadsPerBlock(blockSize);
+	dim3 fullBlocksPerGrid;
+	//2 data elements per thread; integer ceiling, since float(N) is inexact above 2^24
+	int numBlocks = (N + 2*blockSize - 1)/(2*blockSize);
+	if(numBlocks > MAX_GRID_DIM_X){
+		fullBlocksPerGrid = dim3(MAX_GRID_DIM_X, (int)ceil( numBlocks / float(MAX_GRID_DIM_X)));
+	}else{
+		fullBlocksPerGrid = dim3(numBlocks);
+	}
+
+	//Create an array to store results from each block
+	DataType* blockResults;
+	cudaMalloc((void**)&blockResults, numBlocks*sizeof(DataType));
+	inclusive_scan_kernel<<<fullBlocksPerGrid, threadsPerBlock, (2*blockSize+2)*sizeof(DataType)>>>(datain, dataout, blockResults, N, op);
+	DataType result;
+	if(numBlocks == 1){
+		//We've reached the bottom of the stack, grab the answer. Just one element
+		cudaMemcpy( &result, blockResults, sizeof(DataType), cudaMemcpyDeviceToHost);
+	}else{
+		//Block b must be offset by the totals of blocks 0..b-1 only, so the totals get an
+		//EXCLUSIVE scan (an inclusive one adds block b's own total twice). It still returns the grand total.
+		result = exclusive_scan(blockResults, blockResults, numBlocks, op);
+		scan_reintegrate_blocks<<<fullBlocksPerGrid, threadsPerBlock>>>(dataout, blockResults, N, op);
+	}
+	cudaFree(blockResults);
+	return result;
+}
+
+
+
+template<typename DataType>
+__host__ DataType inclusive_scan_sum(DataType* datain, DataType* dataout, int N)
+{
+	//Inclusive scan with the Add functor as the combining operation
+	return inclusive_scan(datain, dataout, N, Add());
+}
+
+
+template<typename DataType>
+__host__ DataType inclusive_scan_sum_wrapper(DataType* datain, DataType* dataout, int N)
+{
+	//Forwards to inclusive_scan_wrapper with the Add functor as the combining operation
+	return inclusive_scan_wrapper(datain, dataout, N, Add());
+}
+
+
+template<typename DataType>
+__host__ DataType exclusive_scan_sum(DataType* datain, DataType* dataout, int N)
+{
+	//Exclusive scan with the Add functor as the combining operation
+	return exclusive_scan(datain, dataout, N, Add());
+}
+
+
+//Exclusive scan of up to 2*blockDim.x elements by a single thread block (two elements per thread).
+//Based on http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+//Allows in place scans by setting datain == dataout. Only works for an array ptr to device mem.
+//Returns the combination of all N inputs (the tile total), captured between the up sweep and the down sweep.
+//NOTE(review): temp[] is declared int whatever DataType is, and out-of-range slots are padded with 0 - confirm before using other types/ops.
+template<typename DataType, typename BinaryOperation>
+__device__ DataType exclusive_scan_block(DataType* datain, DataType* dataout, int N, BinaryOperation op)
+{  
+	extern __shared__ int temp[];
+	int index = threadIdx.x;  
+	int offset = 1;  
+	int n = 2*blockDim.x;//tile size: two elements per thread
+	//Each thread handles one element in each half of the tile (ai and bi).
+	//The CONFLICT_FREE_OFFSET value is added to every shared-memory index.
+	int ai = index;
+	int bi = index + n/2;
+	int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+	int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+	if(ai < N){
+		temp[ai+bankOffsetA] = datain[ai]; // load input into shared memory  
+	}else{
+		temp[ai+bankOffsetA] = 0;
+	}
+	if(bi < N){
+		temp[bi+bankOffsetB] = datain[bi];  
+	}else{
+		temp[bi+bankOffsetB] = 0;//out of range: pad shared memory with 0
+	}
+
+	// Up sweep: build partial results in place up the tree  
+	// d limits the number of active threads, halving it each iteration.
+	for (int d = n>>1; d > 0; d >>= 1)                  
+	{   
+		__syncthreads();  //Make sure previous step has completed
+		if (index < d)  
+		{
+			int ai2 = offset*(2*index+1)-1;  
+			int bi2 = offset*(2*index+2)-1;  
+			ai2 += CONFLICT_FREE_OFFSET(ai2);
+			bi2 += CONFLICT_FREE_OFFSET(bi2);
+
+			temp[bi2] = op(temp[ai2], temp[bi2]);  
+		}  
+		offset *= 2;  //Stride between combined elements doubles each level
+	}
+	//Up sweep complete; the last slot now holds the combination of the whole tile.
+	__syncthreads();
+	DataType total = temp[(n-1)+CONFLICT_FREE_OFFSET(n-1)];
+	if (index == 0) { temp[(n - 1)+CONFLICT_FREE_OFFSET(n-1)] = 0; } // clear the last element in prep for down scan
+
+	//NOTE(review): no barrier separates the read of total from the clear above; only thread 0's copy is certain to be the pre-clear value.
+	for (int d = 1; d < n; d *= 2) // traverse down tree & build scan  
+	{  
+		offset >>= 1;  
+		__syncthreads();  //wait for previous step to finish
+		if (index < d)                       
+		{  
+			int ai2 = offset*(2*index+1)-1;  
+			int bi2 = offset*(2*index+2)-1;  
+			ai2 += CONFLICT_FREE_OFFSET(ai2);
+			bi2 += CONFLICT_FREE_OFFSET(bi2);
+
+			DataType t = temp[ai2];  
+			temp[ai2] = temp[bi2];  
+			temp[bi2] = op(temp[bi2], t);   
+		}  
+	}  
+	__syncthreads();  
+
+	//Store block scan result back to memory.
+	if(ai < N)//Don't write back if out of range
+		dataout[ai] = temp[ai+bankOffsetA]; // write results to device memory  
+	if(bi < N)
+		dataout[bi] = temp[bi+bankOffsetB];  
+
+	//Return the tile total captured after the up sweep.
+	return total;
+}
+
+
+
+//Inclusive scan of up to 2*blockDim.x elements by one thread block; every thread gets the op-total of the first N elements back.
+//Based on http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+//Allows in place scans by setting datain == dataout
+//Only works for an array ptr to device mem. Requires N >= 1 (datain[0] and scratch slot N-1 are read unconditionally).
+//Implemented as an exclusive scan whose results are written one slot to the left.
+template<typename DataType, typename BinaryOperation>
+__device__ DataType inclusive_scan_block(DataType* datain, DataType* dataout, int N, BinaryOperation op)
+{
+	extern __shared__ int temp[];
+	int index = threadIdx.x;
+	int offset = 1;
+	int n = 2*blockDim.x;//number of scratch slots: each thread owns two elements
+	//Each thread stages element index and element index + n/2.
+	//Shared-memory positions are shifted by CONFLICT_FREE_OFFSET.
+	int ai = index;
+	int bi = index + n/2;
+	int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+	int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+	if(ai < N){
+		temp[ai+bankOffsetA] = datain[ai]; // load input into shared memory
+	}else{
+		temp[ai+bankOffsetA] = datain[0];//out of range: filler value, never reaches a written result
+	}
+	if(bi < N){
+		temp[bi+bankOffsetB] = datain[bi];
+	}else{
+		temp[bi+bankOffsetB] = datain[0];//if out of range, pad shared memory with junk (i.e. first element).
+	}
+	__syncthreads();
+
+	//Remember the last input element: the exclusive scan below overwrites its slot with the prefix before it
+	DataType total =  temp[(N - 1)+CONFLICT_FREE_OFFSET(N-1)];
+
+	// build sum in place up the tree
+	// d limits the number of active threads, halving it each iteration.
+	for (int d = n>>1; d > 0; d >>= 1)
+	{
+		__syncthreads();  //Make sure previous step has completed
+		if (index < d)
+		{
+			int ai2 = offset*(2*index+1)-1;
+			int bi2 = offset*(2*index+2)-1;
+			ai2 += CONFLICT_FREE_OFFSET(ai2);
+			bi2 += CONFLICT_FREE_OFFSET(bi2);
+
+			temp[bi2] = op(temp[ai2], temp[bi2]);
+		}
+		offset *= 2;  //Adjust offset
+	}
+	//Reduction step complete.
+	__syncthreads();
+	if (index == 0) { temp[(n - 1)+CONFLICT_FREE_OFFSET(n-1)] = 0; } // clear the last element in prep for down scan
+
+	//Down-sweep: turns the reduction tree into an exclusive scan
+	for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
+	{
+		offset >>= 1;
+		__syncthreads();  //wait for previous step to finish
+		if (index < d)
+		{
+			int ai2 = offset*(2*index+1)-1;
+			int bi2 = offset*(2*index+2)-1;
+			ai2 += CONFLICT_FREE_OFFSET(ai2);
+			bi2 += CONFLICT_FREE_OFFSET(bi2);
+
+			DataType t = temp[ai2];
+			temp[ai2] = temp[bi2];
+			temp[bi2] = op(temp[bi2], t);
+		}
+	}
+	__syncthreads();
+
+	//Store block scan result back to memory, shifted left by one: exclusive[i] is inclusive[i-1].
+	if(ai > 0 && ai < N)//Don't write back if out of range
+		dataout[ai-1] = temp[ai+bankOffsetA]; // write results to device memory
+	if(bi > 0 && bi < N)
+		dataout[bi-1] = temp[bi+bankOffsetB];
+	//NOTE(review): dataout[N-1] is not written here; presumably the caller stores the returned total there - verify.
+	//Inclusive total = (exclusive prefix of the last element) op (last element). Uses op so non-additive operations work too.
+	return op(temp[(N - 1)+CONFLICT_FREE_OFFSET(N-1)], total);
+}
+
+
+///Explicit template instantiations. Do this to avoid code bloat in .h file.
+template int exclusive_scan_sum<int>(int*, int*, int);
+//template float exclusive_scan_sum<float>(float*, float*, int);
+
+template int inclusive_scan_sum<int>(int*, int*, int);
+//template float inclusive_scan_sum<float>(float*, float*, int);
+
+template int streamCompaction<rayState, RayAlive>(rayState*, rayState**, int, RayAlive);
diff --git a/src/cudaAlgorithms.h b/src/cudaAlgorithms.h
new file mode 100644
index 0000000..d98759c
--- /dev/null
+++ b/src/cudaAlgorithms.h
@@ -0,0 +1,67 @@
+#ifndef CUDA_ALGORITHMS_H
+#define CUDA_ALGORITHMS_H
+
+
+#include <stdio.h>
+#include <thrust/random.h>
+#include <cuda.h>
+#include <cmath>
+#include "sceneStructs.h"
+
+
+#if CUDA_VERSION >= 5000
+    #include <helper_math.h>
+#else
+    #include <cutil_math.h>
+#endif
+
+
+
+//Shared-memory bank layout assumed by the scan kernels' index padding (LOG_NUM_BANKS is log2 of NUM_BANKS).
+#define NUM_BANKS 32
+#define LOG_NUM_BANKS 5 
+//Defining NO_BANK_CONFLICTS enables the padded indexing below; without it CONFLICT_FREE_OFFSET is always 0.
+#define NO_BANK_CONFLICTS
+//CONFLICT_FREE_OFFSET(i) is the padding added to shared-memory index i. As written it is i >> 10, i.e. 0 for i < 1024.
+//NOTE(review): GPU Gems 3 ch. 39 derives its padding from i >> LOG_NUM_BANKS; confirm this coarser shift is intended.
+#ifdef NO_BANK_CONFLICTS
+#define CONFLICT_FREE_OFFSET(n)    \
+	(((n) >> (2 * LOG_NUM_BANKS)))  
+#else
+	#define CONFLICT_FREE_OFFSET(a)    (0)  
+#endif
+#define MAX_BLOCK_DIM_X 1024 //presumably the per-block thread limit of the target GPU - TODO confirm
+#define MAX_GRID_DIM_X 65535 //presumably the 1-D grid size limit of the target GPU - TODO confirm
+
+//Predicate used with streamCompaction: a ray counts as alive while its index is non-negative. Callable on const functors.
+struct RayAlive : std::unary_function<rayState, bool>{
+	__host__ __device__ bool operator() (rayState r) const {return r.index >= 0;}
+};
+//Sum functor for the scans. The generic overload keeps same-typed non-float operands (e.g. the int scans) exact instead of rounding through float.
+struct Add : std::binary_function<float,float,float> {
+__host__ __device__ float operator() (float a, float b) const {return (a+b);}
+template<typename T> __host__ __device__ T operator() (T a, T b) const {return (a+b);}
+};
+//Product functor. The generic overload keeps same-typed non-float operands exact instead of rounding through float.
+struct Multiply : std::binary_function<float,float,float> {
+__host__ __device__ float operator() (float a, float b) const {return (a*b);}
+template<typename T> __host__ __device__ T operator() (T a, T b) const {return (a*b);}
+};
+//Host entry points for prefix sums. They are templates defined in the implementation file,
+//so only the explicitly instantiated DataTypes can be linked against.
+template<typename DataType>
+__host__ DataType exclusive_scan_sum(DataType* datain, DataType* dataout, int N);
+template<typename DataType>
+__host__ DataType inclusive_scan_sum(DataType* datain, DataType* dataout, int N);
+//Single-block device scans over at most 2*blockDim.x elements; they use dynamically sized shared memory.
+template<typename DataType, typename BinaryOperation>
+__device__ DataType exclusive_scan_block(DataType* datain, DataType* dataout, int N, BinaryOperation op);
+template<typename DataType, typename BinaryOperation>
+__device__ DataType inclusive_scan_block(DataType* datain, DataType* dataout, int N, BinaryOperation op);
+//Presumably keeps the elements of streamIn for which op is true and returns how many survived - TODO confirm against the definition.
+template<typename DataType, typename FlagOperation>
+__host__ int streamCompaction(DataType* streamIn, DataType** streamOut, int N, FlagOperation op);
+
+
+
+#endif
\ No newline at end of file
diff --git a/src/interactions.h b/src/interactions.h
index 6561796..868506f 100755
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -8,43 +8,237 @@
 
 #include "intersections.h"
 
+struct Fresnel {
+	float reflectionCoefficient;	//share of the energy that is reflected; calculateFresnel keeps the two shares summing to 1
+	float transmissionCoefficient;	//share of the energy that is transmitted
+};
+
+struct AbsorptionAndScatteringProperties{
+	glm::vec3 absorptionCoefficient;	//presumably per-channel absorption per unit distance - TODO confirm units
+	float reducedScatteringCoefficient;	//only mentioned in a commented-out declaration in this header; meaning to be confirmed
+};
+
+
 //forward declaration
+//__host__ __device__ bool calculateScatterAndAbsorption(ray& r, float& depth, AbsorptionAndScatteringProperties& currentAbsorptionAndScattering, glm::vec3& unabsorbedColor, material m, float randomFloatForScatteringDistance, float randomFloat2, float randomFloat3);
 __host__ __device__ glm::vec3 getRandomDirectionInSphere(float xi1, float xi2);
+__host__ __device__ glm::vec3 calculateTransmission(glm::vec3 absorptionCoefficient, float distance);
+__host__ __device__ glm::vec3 calculateTransmissionDirection(glm::vec3 normal, glm::vec3 incident, float incidentIOR, float transmittedIOR);
+__host__ __device__ glm::vec3 calculateReflectionDirection(glm::vec3 normal, glm::vec3 incident);
+__host__ __device__ Fresnel calculateFresnel(glm::vec3 normal, glm::vec3 incident, glm::vec3 transmitDir, glm::vec3 reflectDir, float n1/*incidentIOR*/, float n2/*transmittedIOR*/); //parameter order must match the definition below
 __host__ __device__ glm::vec3 calculateRandomDirectionInHemisphere(glm::vec3 normal, float xi1, float xi2);
+__host__ __device__ void bounceRay(rayState& r, renderOptions rconfig, glm::vec3 intersect, glm::vec3 normal, material* mats, int mhitIndex, float xi0/*for bounce type*/, float xi1/*for importance sampling*/,float xi2/*for importance sampling*/);
 
 //LOOK: This function demonstrates cosine weighted random direction generation in a sphere!
 __host__ __device__ glm::vec3 calculateRandomDirectionInHemisphere(glm::vec3 normal, float xi1, float xi2) {
-    
-    //crucial difference between this and calculateRandomDirectionInSphere: THIS IS COSINE WEIGHTED!
-    
-    float up = sqrt(xi1); // cos(theta)
-    float over = sqrt(1 - up * up); // sin(theta)
-    float around = xi2 * TWO_PI;
-    
-    //Find a direction that is not the normal based off of whether or not the normal's components are all equal to sqrt(1/3) or whether or not at least one component is less than sqrt(1/3). Learned this trick from Peter Kutz.
-    
-    glm::vec3 directionNotNormal;
-    if (abs(normal.x) < SQRT_OF_ONE_THIRD) {
-      directionNotNormal = glm::vec3(1, 0, 0);
-    } else if (abs(normal.y) < SQRT_OF_ONE_THIRD) {
-      directionNotNormal = glm::vec3(0, 1, 0);
-    } else {
-      directionNotNormal = glm::vec3(0, 0, 1);
-    }
-    
-    //Use not-normal direction to generate two perpendicular directions
-    glm::vec3 perpendicularDirection1 = glm::normalize(glm::cross(normal, directionNotNormal));
-    glm::vec3 perpendicularDirection2 = glm::normalize(glm::cross(normal, perpendicularDirection1));
-    
-    return ( up * normal ) + ( cos(around) * over * perpendicularDirection1 ) + ( sin(around) * over * perpendicularDirection2 );
-    
-}
-
-//TODO: IMPLEMENT THIS FUNCTION
-//Now that you know how cosine weighted direction generation works, try implementing non-cosine (uniform) weighted random direction generation.
-//This should be much easier than if you had to implement calculateRandomDirectionInHemisphere.
+	//Returns a direction in the hemisphere around normal, built from the two random numbers xi1 and xi2.
+	//crucial difference between this and getRandomDirectionInSphere: THIS IS COSINE WEIGHTED!
+
+	float up = sqrt(xi1); // cos(theta)
+	float over = sqrt(1 - up * up); // sin(theta)
+	float around = xi2 * TWO_PI; // angle around the normal
+
+	//Pick a coordinate axis that cannot be parallel to the normal: at least one component of a unit normal has magnitude below sqrt(1/3) unless all three equal it, in which case any axis works. Learned this trick from Peter Kutz.
+
+	glm::vec3 directionNotNormal;
+	if (abs(normal.x) < SQRT_OF_ONE_THIRD) {
+		directionNotNormal = glm::vec3(1, 0, 0);
+	} else if (abs(normal.y) < SQRT_OF_ONE_THIRD) {
+		directionNotNormal = glm::vec3(0, 1, 0);
+	} else {
+		directionNotNormal = glm::vec3(0, 0, 1);
+	}
+
+	//Use not-normal direction to generate two directions perpendicular to the normal and to each other
+	glm::vec3 perpendicularDirection1 = glm::normalize(glm::cross(normal, directionNotNormal));
+	glm::vec3 perpendicularDirection2 = glm::normalize(glm::cross(normal, perpendicularDirection1));
+
+	return ( up * normal ) + ( cos(around) * over * perpendicularDirection1 ) + ( sin(around) * over * perpendicularDirection2 );
+
+}
+
+//Unit direction on the whole sphere: for xi1 in [0,1], xi1 picks the height z in [-1,1] and xi2 picks the angle around z.
 __host__ __device__ glm::vec3 getRandomDirectionInSphere(float xi1, float xi2) {
-  return glm::vec3(0,0,0);
+	float height = 2*(xi1-0.5);
+	float azimuth = 2*PI*xi2;
+
+	//Radius of the horizontal circle at this height; x^2+y^2+z^2 = 1, so the result needs no normalization.
+	float ringRadius = glm::sqrt(1-height*height);
+
+	float x = ringRadius*glm::cos(azimuth);
+	float y = ringRadius*glm::sin(azimuth);
+
+	glm::vec3 dir(x, y, height);
+
+	return dir;
+}
+
+__host__ __device__ glm::vec3 sphericalToCartesian(float phi, float th)
+{
+	//phi is the angle around +z, th the angle measured from +z; the result has unit length.
+	float sinTh = glm::sin(th);
+	float x = glm::cos(phi)*sinTh;
+	float y = glm::sin(phi)*sinTh;
+	return glm::vec3(x, y, glm::cos(th));
+}
+__host__ __device__ glm::vec3 sampleSpecularReflectionDirection(glm::vec3 normal, glm::vec3 reflectDir,  float specularExp, float xi1, float xi2)
+{
+	//Sample a lobe about +z: angle from the axis is acos(xi1^(1/(specularExp+1))), angle around it is uniform in xi2.
+	float th = glm::acos(glm::pow(xi1, 1/(specularExp+1)));
+	float phi = 2*PI*xi2;
+	glm::vec3 randDirZ = sphericalToCartesian(phi, th);
+
+	//Build a frame whose z axis is reflectDir. The cross product below vanishes when reflectDir is parallel or
+	//perpendicular to the normal; normalizing it would give NaNs, so fall back to an axis that cannot be parallel to zw.
+	glm::vec3 zw = reflectDir;
+	glm::vec3 xw = glm::cross(glm::dot(normal, reflectDir)*normal, reflectDir);
+	if(glm::dot(xw, xw) < 1e-12f)
+		xw = glm::cross((fabs(zw.x) < SQRT_OF_ONE_THIRD) ? glm::vec3(1, 0, 0) : glm::vec3(0, 1, 0), zw);
+	xw = glm::normalize(xw);
+	glm::vec3 yw = glm::cross(zw, xw);
+	return glm::mat3(xw, yw, zw)*randDirZ;//rotate the +z sample onto reflectDir
+}
+
+__host__ __device__ glm::vec3 sampleSpecularTransmissionDirection(glm::vec3 normal, glm::vec3 transmitDir, float specularExp, float xi1, float xi2)
+{
+	//Sample a lobe about +z: angle from the axis is acos(xi1^(1/(specularExp+1))), angle around it is uniform in xi2.
+	float th = glm::acos(glm::pow(xi1, 1/(specularExp+1)));
+	float phi = 2*PI*xi2;
+	glm::vec3 randDirZ = sphericalToCartesian(phi, th);
+
+	//Build a frame whose z axis is transmitDir. The cross product below vanishes when transmitDir is parallel or
+	//perpendicular to the normal; normalizing it would give NaNs, so fall back to an axis that cannot be parallel to zw.
+	glm::vec3 zw = transmitDir;
+	glm::vec3 xw = glm::cross(glm::dot(normal, transmitDir)*normal, transmitDir);
+	if(glm::dot(xw, xw) < 1e-12f)
+		xw = glm::cross((fabs(zw.x) < SQRT_OF_ONE_THIRD) ? glm::vec3(1, 0, 0) : glm::vec3(0, 1, 0), zw);
+	xw = glm::normalize(xw);
+	glm::vec3 yw = glm::cross(zw, xw);
+	return glm::mat3(xw, yw, zw)*randDirZ;//rotate the +z sample onto transmitDir
+}
+
+__host__ __device__ Fresnel calculateFresnel(glm::vec3 normal, glm::vec3 incident, glm::vec3 transmitDir, glm::vec3 reflectDir, float n1/*incidentIOR*/, float n2/*transmittedIOR*/) {
+	//Splits the incoming energy between reflection and transmission.
+	//A direction counts as available only when its length is within 0.001 of 1; bounceRay leaves
+	//directions it did not compute at (0,0,0), which fails that check.
+	Fresnel result;
+
+	if(!epsilonCheck(glm::length(transmitDir), 1.0, 0.001))
+	{
+		//No usable transmitted direction: everything is reflected
+		result.transmissionCoefficient = 0;
+		result.reflectionCoefficient = 1;
+		return result;
+	}
+
+	if(!epsilonCheck(glm::length(reflectDir), 1.0, 0.001))
+	{
+		//Transmission only: everything is transmitted
+		result.transmissionCoefficient = 1;
+		result.reflectionCoefficient = 0;
+		return result;
+	}
+
+	//Both available: average the two squared amplitude ratios. Absolute values give the smallest angles to the normal.
+	float cosIn = abs(glm::dot(normal,incident));
+	float cosOut = abs(glm::dot(normal,transmitDir));
+	float rA = (n1*cosIn-n2*cosOut)/(n1*cosIn+n2*cosOut);
+	float rB = (n2*cosIn-n1*cosOut)/(n2*cosIn+n1*cosOut);
+	result.reflectionCoefficient = (rB*rB+rA*rA)/2;
+	result.transmissionCoefficient = 1-result.reflectionCoefficient;
+	return result;
+}
+
+//Per-channel fraction of light left after travelling `distance` through a medium: exp(-absorptionCoefficient*distance).
+__host__ __device__ glm::vec3 calculateTransmission(glm::vec3 absorptionCoefficient, float distance)
+{
+	return glm::exp(absorptionCoefficient*(-distance));
+}
+
+__host__ __device__ glm::vec3 calculateTransmissionDirection(glm::vec3 normal, glm::vec3 incident, float incidentIOR, float transmittedIOR)
+{
+	//Refraction needs a normal that opposes the incoming ray, so flip one that points the same way.
+	glm::vec3 facingNormal = (glm::dot(normal, incident) > 0.0) ? -normal : normal;
+
+	//eta: index of the medium being left over the index of the medium being entered
+	float eta = incidentIOR/transmittedIOR;
+	return glm::refract(incident, facingNormal, eta);
 }
 
+__host__ __device__ glm::vec3 calculateReflectionDirection(glm::vec3 normal, glm::vec3 incident) {
+	//Mirrors `incident` about the surface; the normal is first turned to oppose the incoming ray.
+	glm::vec3 facingNormal = (glm::dot(normal, incident) > 0.0) ? -normal : normal;
+	float twiceProjection = 2.0f*glm::dot(facingNormal, incident);
+	return incident-twiceProjection * facingNormal;
+}
+
+
+//Scatters ray r at a surface hit: picks diffuse, specular reflection or transmission and updates r's origin,
+//direction, throughput (r.T), current medium (r.matIndex) and r.bounceType. Takes three random numbers to use in sampling.
+//Do not call this function if ray hit a light source.
+__host__ __device__ void bounceRay(rayState& r, renderOptions rconfig, glm::vec3 intersect, glm::vec3 normal, material* mats, int mhitIndex, float xi0/*for bounce type*/, float xi1/*for importance sampling*/,float xi2/*for importance sampling*/)
+{
+	material m = mats[mhitIndex];//material we hit
+
+	float mLastIOR;
+	if(r.matIndex >= 0 ){
+		mLastIOR = mats[r.matIndex].indexOfRefraction;//material we were traveling through
+	}
+	else{
+		mLastIOR = rconfig.airIOR;//material we were traveling through is open space
+	}
+	//Hitting the material we are already inside means leaving it, so the far side is open space (matches the matIndex = -1 bookkeeping below).
+	float mNextIOR = (r.matIndex == mhitIndex) ? rconfig.airIOR : m.indexOfRefraction;
+	//phong inspired light model: specular with probability ks, diffuse with the implicit kd = 1-ks.
+	float ks = clamp(MAX(m.hasReflective, m.hasRefractive), 0.0f,1.0f);
+
+	//Specular or diffuse? xi0 decides the bounce type so xi1 and xi2 stay independent of that choice for direction sampling.
+	if(ks > 0.0f && xi0 <= ks)
+	{
+		glm::vec3 reflectDir = glm::vec3(0,0,0);
+		glm::vec3 transmitDir = glm::vec3(0,0,0);
+
+		if(m.hasReflective){
+			reflectDir = calculateReflectionDirection(normal, r.r.direction);
+		}
+		if(m.hasRefractive){
+			transmitDir = calculateTransmissionDirection(normal, r.r.direction, mLastIOR, mNextIOR);
+		}
+
+		//compute fresnel coeffs
+		Fresnel f = calculateFresnel(normal, r.r.direction, transmitDir, reflectDir, mLastIOR, mNextIOR);
+
+		//scale our random number by ks
+		//0 <= xi0 <= ks, therefore 0 <= xi0/ks <= 1
+		if(xi0/ks <= f.reflectionCoefficient){
+			if(!m.hasReflective) reflectDir = calculateReflectionDirection(normal, r.r.direction);//reflect: a refractive-only material still needs a mirror direction here
+			r.r.direction = sampleSpecularReflectionDirection(normal, reflectDir, m.specularExponent, xi1, xi2);
+			r.r.origin = intersect;
+			r.T *= m.specularColor;
+			r.bounceType = REFLECT;
+		}else{
+			//transmit
+			r.r.direction = sampleSpecularTransmissionDirection(normal, transmitDir, m.specularExponent, xi1, xi2);
+			r.r.origin = intersect;
+			if(glm::dot(normal, r.r.direction) < 0.0)
+			{
+				//entering the material
+				r.matIndex = mhitIndex;
+			}else{
+				//exiting
+				r.matIndex = -1;
+			}
+			r.bounceType = TRANSMIT;
+		}
+	}else{
+
+		//Randomly select direction in hemisphere. Medium doesn't change. Accumulate diffuse refection color
+		r.r.direction = calculateRandomDirectionInHemisphere(normal, xi1, xi2);
+		r.r.origin = intersect;
+		r.T *= m.color;
+		r.bounceType = DIFFUSE;
+	}
+}
+
+
 #endif
diff --git a/src/intersections.h b/src/intersections.h
index a6b9469..c138e6e 100755
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -42,6 +42,67 @@ __host__ __device__ bool epsilonCheck(float a, float b){
     }
 }
 
+__host__ __device__ bool epsilonCheck(float a, float b, float ep){
+    //True when the magnitudes |a| and |b| differ by less than ep.
+    //Signs are ignored, so for example -1 and 1 compare as equal.
+    //A NaN on either side makes the comparison false.
+    bool withinTolerance = fabs(fabs(a)-fabs(b))<ep;
+    return withinTolerance;
+}
+
+//Runs the intersection routine that matches geom.type and forwards its return value and output parameters.
+//Any type other than SPHERE or CUBE yields -1 and leaves the output parameters untouched.
+__host__ __device__ float geomIntersectionTest(staticGeom geom, ray r, glm::vec3& intersectionPoint, glm::vec3& normal)
+{
+	if(geom.type == SPHERE){
+		return sphereIntersectionTest(geom, r, intersectionPoint, normal); 
+	}
+	if(geom.type == CUBE){
+		return boxIntersectionTest(geom, r, intersectionPoint, normal); 
+	}
+	//Unsupported geometry type: firstIntersect treats a non-positive distance as a miss
+	return -1;
+}
+
+
+//Traces ray r against every geometry and keeps the closest hit. Returns the index of that geometry in geoms, or -1 if nothing is hit.
+__host__ __device__ int firstIntersect(staticGeom* geoms, int numberOfGeoms, ray r, glm::vec3& intersectionPoint, glm::vec3& normal, float& distance)
+{
+	//intersectionPoint and normal are only written when something is hit; distance is -1 on a miss.
+	int nearestGeomInd = -1;
+	distance = -1;
+
+	//March along ray a tiny bit to fix floating point errors (r is passed by value, so the caller's ray is unchanged)
+	r.origin += r.direction*RAY_BIAS_AMOUNT;
+
+	//for each geometry object
+	//TODO create better scene graph to improve collision detection for more complicated scenes. (Octtree)
+	for(int g = 0; g < numberOfGeoms; ++g)
+	{
+		//Scratch outputs, so a farther hit cannot overwrite the best result so far
+		glm::vec3 hitPoint;
+		glm::vec3 hitNormal;
+		float hitDist = geomIntersectionTest(geoms[g], r, hitPoint, hitNormal);
+		//Anything that is not a positive distance is a miss
+		if(!(hitDist > 0)){
+			continue;
+		}
+		//Keep only the first hit or a closer one
+		if(!(distance < 0 || hitDist < distance)){
+			continue;
+		}
+
+		distance = hitDist;
+		nearestGeomInd = g;
+		intersectionPoint = hitPoint;
+		normal = hitNormal;
+	}
+
+	return nearestGeomInd;
+}
+
+
 //Self explanatory
 __host__ __device__ glm::vec3 getPointOnRay(ray r, float t){
   return r.origin + float(t-.0001)*glm::normalize(r.direction);
@@ -160,6 +221,7 @@ __host__ __device__  float boxIntersectionTest(glm::vec3 boxMin, glm::vec3 boxMa
 
 
     normal = multiplyMV(box.transform, glm::vec4(currentNormal,0.0));
+	normal = glm::normalize(normal);//DON'T FORGET TO NORMALIZE
     return glm::length(intersectionPoint-ro.origin);
 }
 
diff --git a/src/main.cpp b/src/main.cpp
index 81836b1..3ae5d4b 100755
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,87 +13,148 @@
 
 int main(int argc, char** argv){
 
-  #ifdef __APPLE__
-	  // Needed in OSX to force use of OpenGL3.2 
-	  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3);
-	  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 2);
-	  glfwOpenWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
-	  glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
-  #endif
-
-  // Set up pathtracer stuff
-  bool loadedScene = false;
-  finishedRender = false;
-
-  targetFrame = 0;
-  singleFrameMode = false;
-
-  // Load scene file
-  for(int i=1; i<argc; i++){
-    string header; string data;
-    istringstream liness(argv[i]);
-    getline(liness, header, '='); getline(liness, data, '=');
-    if(strcmp(header.c_str(), "scene")==0){
-      renderScene = new scene(data);
-      loadedScene = true;
-    }else if(strcmp(header.c_str(), "frame")==0){
-      targetFrame = atoi(data.c_str());
-      singleFrameMode = true;
-    }
-  }
-
-  if(!loadedScene){
-    cout << "Error: scene file needed!" << endl;
-    return 0;
-  }
-
-  // Set up camera stuff from loaded pathtracer settings
-  iterations = 0;
-  renderCam = &renderScene->renderCam;
-  width = renderCam->resolution[0];
-  height = renderCam->resolution[1];
-
-  if(targetFrame>=renderCam->frames){
-    cout << "Warning: Specified target frame is out of range, defaulting to frame 0." << endl;
-    targetFrame = 0;
-  }
-
-  // Launch CUDA/GL
-
-  #ifdef __APPLE__
+#ifdef __APPLE__
+	// Needed in OSX to force use of OpenGL3.2 
+	glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3);
+	glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 2);
+	glfwOpenWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+	glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+#endif
+
+
+
+	// Set up pathtracer stuff
+	bool loadedScene = false;
+	finishedRender = false;
+
+	targetFrame = 0;
+	singleFrameMode = false;
+
+	// Load scene file. Arguments have the form key=value; "scene" and "frame" are recognised.
+	for(int i=1; i<argc; i++){
+		string header; string data;
+		istringstream liness(argv[i]);
+		getline(liness, header, '='); getline(liness, data, '=');
+		if(strcmp(header.c_str(), "scene")==0){
+			renderScene = new scene(data);
+			loadedScene = true;
+		}else if(strcmp(header.c_str(), "frame")==0){
+			targetFrame = atoi(data.c_str());
+			singleFrameMode = true;
+		}
+	}
+
+	if(!loadedScene){
+		cout << "Error: scene file needed!" << endl;
+		return 0;
+	}
+
+	//Print CUDA device info
+	// Number of CUDA devices
+	int devCount = 0;
+	if(cudaGetDeviceCount(&devCount) != cudaSuccess) devCount = 0;//query failed: report no devices instead of looping over an indeterminate count
+	printf("CUDA Device Query...\n");
+	printf("There are %d CUDA devices.\n", devCount);
+
+	// Iterate through devices
+	for (int i = 0; i < devCount; ++i)
+	{
+		// Get device properties
+		printf("\nCUDA Device #%d\n", i);
+		cudaDeviceProp devProp;
+		cudaGetDeviceProperties(&devProp, i);
+		utilityCore::printDevProp(devProp);
+	}
+
+	// Set up camera stuff from loaded pathtracer settings
+	iterations = 0;
+	frameFilterCounter = 0;
+	renderCam = &renderScene->renderCam;
+	width = renderCam->resolution[0];
+	height = renderCam->resolution[1];
+
+
+	//TODO: Set up rendering options
+	renderOpts = new renderOptions();
+	renderOpts->mode = TRACEDEPTH_DEBUG;
+	renderOpts->traceDepth = 50;
+	renderOpts->rayPoolSize =1.0f;//Size of pool relative to number of pixels. 1.0f means 1 ray per pixel
+	renderOpts->stocasticRayAssignment = false;
+	
+	//Defaults
+	renderOpts->globalLightGeomInd = -1;
+	//Setup global lighting conditions: use the first geometry whose material is flagged as a global light
+	for(int m = 0; m < renderScene->materials.size(); m++)
+	{
+		if(renderScene->materials[m].specularExponent < 0)//Flag for global light source
+		{
+			//If we have a global light material, look for a corresponding geom
+			for(int g = 0; g <  renderScene->objects.size(); g++)
+			{
+				if(renderScene->objects[g].materialid == m)
+				{
+					renderOpts->globalLightGeomInd = g;
+					break;
+				}
+			}
+			if(renderOpts->globalLightGeomInd > -1)
+				break;
+		}
+	}
+	
+
+	//Rendering toggle options
+	renderOpts->antialiasing = true;
+	renderOpts->streamCompaction = true;
+	renderOpts->frameFiltering = true;
+	renderOpts->globalShadows = true;
+
+
+	renderOpts->airIOR = 1.0;
+	renderOpts->airAbsorbtion = glm::vec3(0.0, 0.0, 0.0);//No air absorbtion effects for now
+	renderOpts->minT = 0.001;
+
+	if(targetFrame>=renderCam->frames){
+		cout << "Warning: Specified target frame is out of range, defaulting to frame 0." << endl;
+		targetFrame = 0;
+	}
+
+	// Launch CUDA/GL
+
+#ifdef __APPLE__
 	init();
-  #else
+#else
 	init(argc, argv);
-  #endif
+#endif
 
-  initCuda();
+	initCuda();
 
-  initVAO();
-  initTextures();
+	initVAO();
+	initTextures();
 
-  GLuint passthroughProgram;
-  passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl");
+	GLuint passthroughProgram;
+	passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl");
 
-  glUseProgram(passthroughProgram);
-  glActiveTexture(GL_TEXTURE0);
+	glUseProgram(passthroughProgram);
+	glActiveTexture(GL_TEXTURE0);
 
-  #ifdef __APPLE__
-	  // send into GLFW main loop
-	  while(1){
+#ifdef __APPLE__
+	// send into GLFW main loop
+	while(1){
 		display();
 		if (glfwGetKey(GLFW_KEY_ESC) == GLFW_PRESS || !glfwGetWindowParam( GLFW_OPENED )){
-				exit(0);
+			exit(0);
 		}
-	  }
+	}
 
-	  glfwTerminate();
-  #else
-	  glutDisplayFunc(display);
-	  glutKeyboardFunc(keyboard);
+	glfwTerminate();
+#else
+	glutDisplayFunc(display);
+	glutKeyboardFunc(keyboard);
 
-	  glutMainLoop();
-  #endif
-  return 0;
+	glutMainLoop();
+#endif
+	return 0;
 }
 
 //-------------------------------
@@ -101,133 +162,203 @@ int main(int argc, char** argv){
 //-------------------------------
 
 void runCuda(){
+	// Map OpenGL buffer object for writing from CUDA on a single GPU
+	// No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
+
+	if(iterations<renderCam->iterations){
+		uchar4 *dptr=NULL;
+		iterations++;
+		frameFilterCounter++;
+		cudaGLMapBufferObject((void**)&dptr, pbo);
+
+		//pack geom and material arrays
+		geom* geoms = new geom[renderScene->objects.size()];
+		material* materials = new material[renderScene->materials.size()];
+
+		for(int i=0; i<renderScene->objects.size(); i++){
+			geoms[i] = renderScene->objects[i];
+		}
+		for(int i=0; i<renderScene->materials.size(); i++){
+			materials[i] = renderScene->materials[i];
+		}
+
+		//Measure frame rate
+		static clock_t tic;
+		clock_t toc = tic;
+		tic = clock();
+		//Slight Low pass filter to make FPS easier to read. Skipped when no clock tick elapsed, to avoid dividing by zero.
+		if(tic > toc) fps = 0.2*fps + 0.8*CLOCKS_PER_SEC/float(tic-toc);
+
+		// execute the kernel
+		cudaRaytraceCore(dptr, renderCam, renderOpts, targetFrame, iterations, frameFilterCounter, materials, renderScene->materials.size(), geoms, renderScene->objects.size() );
+		delete[] geoms;//packed copies are rebuilt every call; NOTE(review): assumes cudaRaytraceCore does not keep these host pointers
+		delete[] materials;
+		// unmap buffer object
+		cudaGLUnmapBufferObject(pbo);
+	}else{
+
+		if(!finishedRender){
+			//output image file
+			image outputImage(renderCam->resolution.x, renderCam->resolution.y);
+
+			for(int x=0; x<renderCam->resolution.x; x++){
+				for(int y=0; y<renderCam->resolution.y; y++){
+					int index = x + (y * renderCam->resolution.x);
+					outputImage.writePixelRGB(renderCam->resolution.x-1-x,y,renderCam->image[index]/float(frameFilterCounter > 0 ? frameFilterCounter : 1));
+				}
+			}
+
+			gammaSettings gamma;
+			gamma.applyGamma = true;
+			gamma.gamma = 1.25/1.5;
+			gamma.divisor = 1.0;
+			outputImage.setGammaSettings(gamma);
+			string filename = renderCam->imageName;
+			string s;
+			stringstream out;
+			out << targetFrame;
+			s = out.str();
+			utilityCore::replaceString(filename, ".bmp", "."+s+".bmp");
+			utilityCore::replaceString(filename, ".png", "."+s+".png");
+			outputImage.saveImageRGB(filename);
+			cout << "Saved frame " << s << " to " << filename << endl;
+			finishedRender = true;
+			if(singleFrameMode==true){
+				cudaDeviceReset(); 
+				exit(0);
+			}
+		}
+		if(targetFrame<renderCam->frames-1){
+			//clear image buffer and move onto next frame
+			targetFrame++;
+			iterations = 0;
+			frameFilterCounter = 0;//the buffer is cleared below, so the divisor used when saving must restart too
+			for(int i=0; i<renderCam->resolution.x*renderCam->resolution.y; i++){
+				renderCam->image[i] = glm::vec3(0,0,0);
+			}
+			cudaDeviceReset(); 
+			finishedRender = false;
+		}
+	}
+
 
-  // Map OpenGL buffer object for writing from CUDA on a single GPU
-  // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
-  
-  if(iterations<renderCam->iterations){
-    uchar4 *dptr=NULL;
-    iterations++;
-    cudaGLMapBufferObject((void**)&dptr, pbo);
-  
-    //pack geom and material arrays
-    geom* geoms = new geom[renderScene->objects.size()];
-    material* materials = new material[renderScene->materials.size()];
-    
-    for(int i=0; i<renderScene->objects.size(); i++){
-      geoms[i] = renderScene->objects[i];
-    }
-    for(int i=0; i<renderScene->materials.size(); i++){
-      materials[i] = renderScene->materials[i];
-    }
-    
-  
-    // execute the kernel
-    cudaRaytraceCore(dptr, renderCam, targetFrame, iterations, materials, renderScene->materials.size(), geoms, renderScene->objects.size() );
-    
-    // unmap buffer object
-    cudaGLUnmapBufferObject(pbo);
-  }else{
-
-    if(!finishedRender){
-      //output image file
-      image outputImage(renderCam->resolution.x, renderCam->resolution.y);
-
-      for(int x=0; x<renderCam->resolution.x; x++){
-        for(int y=0; y<renderCam->resolution.y; y++){
-          int index = x + (y * renderCam->resolution.x);
-          outputImage.writePixelRGB(renderCam->resolution.x-1-x,y,renderCam->image[index]);
-        }
-      }
-      
-      gammaSettings gamma;
-      gamma.applyGamma = true;
-      gamma.gamma = 1.0;
-      gamma.divisor = 1.0; //renderCam->iterations;
-      outputImage.setGammaSettings(gamma);
-      string filename = renderCam->imageName;
-      string s;
-      stringstream out;
-      out << targetFrame;
-      s = out.str();
-      utilityCore::replaceString(filename, ".bmp", "."+s+".bmp");
-      utilityCore::replaceString(filename, ".png", "."+s+".png");
-      outputImage.saveImageRGB(filename);
-      cout << "Saved frame " << s << " to " << filename << endl;
-      finishedRender = true;
-      if(singleFrameMode==true){
-        cudaDeviceReset(); 
-        exit(0);
-      }
-    }
-    if(targetFrame<renderCam->frames-1){
-
-      //clear image buffer and move onto next frame
-      targetFrame++;
-      iterations = 0;
-      for(int i=0; i<renderCam->resolution.x*renderCam->resolution.y; i++){
-        renderCam->image[i] = glm::vec3(0,0,0);
-      }
-      cudaDeviceReset(); 
-      finishedRender = false;
-    }
-  }
-  
 }
 
 #ifdef __APPLE__
 
-	void display(){
-		runCuda();
+void display(){
+	runCuda(); // advance the render by one iteration, or save / move to the next frame once all iterations are done
 
-		string title = "CIS565 Render | " + utilityCore::convertIntToString(iterations) + " Iterations";
-		glfwSetWindowTitle(title.c_str());
+	string title = "CIS565 Render | " + utilityCore::convertIntToString(iterations) + " Iterations";
+	glfwSetWindowTitle(title.c_str());
 
-		glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
-		glBindTexture(GL_TEXTURE_2D, displayImage);
-		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
-			  GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+	glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
+	glBindTexture(GL_TEXTURE_2D, displayImage);
+	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
+		GL_RGBA, GL_UNSIGNED_BYTE, NULL);
 
-		glClear(GL_COLOR_BUFFER_BIT);   
+	glClear(GL_COLOR_BUFFER_BIT);   
 
-		// VAO, shader program, and texture already bound
-		glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
+	// Draw the quad (6 indices) that shows displayImage; VAO, shader program, and texture already bound
+	glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
 
-		glfwSwapBuffers();
-	}
+	glfwSwapBuffers();
+}
 
 #else
 
-	void display(){
-		runCuda();
+void display(){
 
-		string title = "565Raytracer | " + utilityCore::convertIntToString(iterations) + " Iterations";
-		glutSetWindowTitle(title.c_str());
 
-		glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
-		glBindTexture(GL_TEXTURE_2D, displayImage);
-		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
-			  GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+	runCuda(); // advance the render by one iteration, or save / move to the next frame once all iterations are done
 
-		glClear(GL_COLOR_BUFFER_BIT);   
 
-		// VAO, shader program, and texture already bound
-		glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
+	string title = "565Pathtracer | " + utilityCore::convertIntToString(iterations) + " Iterations | FPS " + utilityCore::convertIntToString(fps);
+	glutSetWindowTitle(title.c_str());
 
-		glutPostRedisplay();
-		glutSwapBuffers();
-	}
+	glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
+	glBindTexture(GL_TEXTURE_2D, displayImage);
+	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
+		GL_RGBA, GL_UNSIGNED_BYTE, NULL);
 
-	void keyboard(unsigned char key, int x, int y)
+	glClear(GL_COLOR_BUFFER_BIT);   
+
+	// Draw the quad (6 indices) that shows displayImage; VAO, shader program, and texture already bound
+	glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
+
+	glutPostRedisplay(); // request another display() call so rendering keeps iterating
+	glutSwapBuffers();
+}
+
+void keyboard(unsigned char key, int x, int y)
+{
+	std::cout << key << std::endl; // echo the pressed key to the console
+	switch (key) 
 	{
-		std::cout << key << std::endl;
-		switch (key) 
-		{
-		   case(27):
-			   exit(1);
-			   break;
-		}
+	case(27):
+		//ESC: reset device to flush profiling data, then quit
+		cudaDeviceReset(); 
+		exit(0);
+		break;
+		///Mode selection options (keys 1-5)
+	case '1':
+		//Enter normal pathtracing mode
+		renderOpts->mode = PATHTRACE;
+		cout << "Pathtracing Mode" <<endl;
+		break;
+	case '2':
+		//Enter ray count debug mode
+		renderOpts->mode = RAYCOUNT_DEBUG;
+		cout << "Ray Count Debug Mode" <<endl;
+		break;
+	case '3':
+		//Enter trace depth debug mode
+		renderOpts->mode = TRACEDEPTH_DEBUG;
+		cout << "Trace Depth Debug Mode" <<endl;
+		break;
+	case '4':
+		//Show first hit color
+		renderOpts->mode = FIRST_HIT_DEBUG;
+		cout << "First Hit Debug Mode" <<endl;
+		break;
+	case '5':
+		//Enter normal debug mode
+		renderOpts->mode = NORMAL_DEBUG;
+		cout <<  "Normal Debug Mode" <<endl;
+		break;
+	case 'A': // upper-case keys toggle rendering options
+		renderOpts->antialiasing = !renderOpts->antialiasing;
+		cout << "Antialiasing: " << renderOpts->antialiasing<< endl;
+		break;
+	case 'S':
+		renderOpts->streamCompaction = !renderOpts->streamCompaction;
+		cout << "Stream Compaction: " << renderOpts->streamCompaction<< endl;
+		break;
+	case 'F': // toggling the frame filter also restarts the frame counter
+		renderOpts->frameFiltering = !renderOpts->frameFiltering;
+		frameFilterCounter = 0;
+		cout << "Frame Filter: " << renderOpts->frameFiltering<< endl;
+		break;
+	case 'G':
+		renderOpts->globalShadows = !renderOpts->globalShadows;
+		cout << "Global Shadows: " << renderOpts->globalShadows<< endl;
+		break;
+	case 'f': // lower-case f restarts the frame counter without toggling the filter
+		frameFilterCounter = 0;
+		cout << "Frame Filter Reset" << endl;
+		break;
+	case '=':
+		renderOpts->traceDepth++;
+		cout << "Trace Depth: " << renderOpts->traceDepth << endl;
+		break;
+	case '-': // trace depth never drops below 1
+		if(renderOpts->traceDepth > 1)
+			renderOpts->traceDepth--;
+		cout << "Trace Depth: " << renderOpts->traceDepth << endl;
+		break;
 	}
+	//TODO: Add more keyboard controls here
+}
 
 #endif
 
@@ -239,129 +370,130 @@ void runCuda(){
 //-------------------------------
 
 #ifdef __APPLE__
-	void init(){
+void init(){
 
-		if (glfwInit() != GL_TRUE){
-			shut_down(1);      
-		}
-
-		// 16 bit color, no depth, alpha or stencil buffers, windowed
-		if (glfwOpenWindow(width, height, 5, 6, 5, 0, 0, 0, GLFW_WINDOW) != GL_TRUE){
-			shut_down(1);
-		}
+	if (glfwInit() != GL_TRUE){
+		shut_down(1);      
+	}
 
-		// Set up vertex array object, texture stuff
-		initVAO();
-		initTextures();
+	// 16 bit color, no depth, alpha or stencil buffers, windowed
+	if (glfwOpenWindow(width, height, 5, 6, 5, 0, 0, 0, GLFW_WINDOW) != GL_TRUE){
+		shut_down(1);
 	}
-#else
-	void init(int argc, char* argv[]){
-		glutInit(&argc, argv);
-		glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
-		glutInitWindowSize(width, height);
-		glutCreateWindow("565Raytracer");
-
-		// Init GLEW
-		glewInit();
-		GLenum err = glewInit();
-		if (GLEW_OK != err)
-		{
-			/* Problem: glewInit failed, something is seriously wrong. */
-			std::cout << "glewInit failed, aborting." << std::endl;
-			exit (1);
-		}
 
-		initVAO();
-		initTextures();
+	// Set up vertex array object, texture stuff
+	initVAO();
+	initTextures();
+}
+#else
+void init(int argc, char* argv[]){
+	glutInit(&argc, argv);
+	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
+	glutInitWindowSize(width, height);
+	glutCreateWindow("565Raytracer");
+
+	// Init GLEW
+	glewInit();
+	GLenum err = glewInit();
+	if (GLEW_OK != err)
+	{
+		/* Problem: glewInit failed, something is seriously wrong. */
+		std::cout << "glewInit failed, aborting." << std::endl;
+		exit (1);
 	}
+
+	initVAO();
+	initTextures();
+}
 #endif
 
 void initPBO(GLuint* pbo){
-  if (pbo) {
-    // set up vertex data parameter
-    int num_texels = width*height;
-    int num_values = num_texels * 4;
-    int size_tex_data = sizeof(GLubyte) * num_values;
-    
-    // Generate a buffer ID called a PBO (Pixel Buffer Object)
-    glGenBuffers(1,pbo);
-    // Make this the current UNPACK buffer (OpenGL is state-based)
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
-    // Allocate data for the buffer. 4-channel 8-bit image
-    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
-    cudaGLRegisterBufferObject( *pbo );
-  }
+	if (pbo) {
+		// set up vertex data parameter
+		int num_texels = width*height;
+		int num_values = num_texels * 4;
+		int size_tex_data = sizeof(GLubyte) * num_values;
+
+		// Generate a buffer ID called a PBO (Pixel Buffer Object)
+		glGenBuffers(1,pbo);
+		// Make this the current UNPACK buffer (OpenGL is state-based)
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
+		// Allocate data for the buffer. 4-channel 8-bit image
+		glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
+		cudaGLRegisterBufferObject( *pbo );
+	}
 }
 
 void initCuda(){
-  // Use device with highest Gflops/s
-  cudaGLSetGLDevice( compat_getMaxGflopsDeviceId() );
+	// Use device with highest Gflops/s
+	cudaGLSetGLDevice( compat_getMaxGflopsDeviceId() );
 
-  initPBO(&pbo);
+	initPBO(&pbo);
 
-  // Clean up on program exit
-  atexit(cleanupCuda);
+	// Clean up on program exit
+	atexit(cleanupCuda);
 
-  runCuda();
+
+	runCuda();
 }
 
 void initTextures(){
-    glGenTextures(1,&displayImage);
-    glBindTexture(GL_TEXTURE_2D, displayImage);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
-        GL_UNSIGNED_BYTE, NULL);
+	glGenTextures(1,&displayImage);
+	glBindTexture(GL_TEXTURE_2D, displayImage);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
+		GL_UNSIGNED_BYTE, NULL);
 }
 
 void initVAO(void){
-    GLfloat vertices[] =
-    { 
-        -1.0f, -1.0f, 
-         1.0f, -1.0f, 
-         1.0f,  1.0f, 
-        -1.0f,  1.0f, 
-    };
-
-    GLfloat texcoords[] = 
-    { 
-        1.0f, 1.0f,
-        0.0f, 1.0f,
-        0.0f, 0.0f,
-        1.0f, 0.0f
-    };
-
-    GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
-
-    GLuint vertexBufferObjID[3];
-    glGenBuffers(3, vertexBufferObjID);
-    
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); 
-    glEnableVertexAttribArray(positionLocation);
-
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
-    glEnableVertexAttribArray(texcoordsLocation);
-
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
+	GLfloat vertices[] =
+	{ 
+		-1.0f, -1.0f, 
+		1.0f, -1.0f, 
+		1.0f,  1.0f, 
+		-1.0f,  1.0f, 
+	};
+
+	GLfloat texcoords[] = 
+	{ 
+		1.0f, 1.0f,
+		0.0f, 1.0f,
+		0.0f, 0.0f,
+		1.0f, 0.0f
+	};
+
+	GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
+
+	GLuint vertexBufferObjID[3];
+	glGenBuffers(3, vertexBufferObjID);
+
+	glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
+	glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
+	glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); 
+	glEnableVertexAttribArray(positionLocation);
+
+	glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
+	glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
+	glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+	glEnableVertexAttribArray(texcoordsLocation);
+
+	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
+	glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
 }
 
 GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath){
-    GLuint program = glslUtility::createProgram(vertexShaderPath, fragmentShaderPath, attributeLocations, 2);
-    GLint location;
+	GLuint program = glslUtility::createProgram(vertexShaderPath, fragmentShaderPath, attributeLocations, 2);
+	GLint location;
 
-    glUseProgram(program);
-    
-    if ((location = glGetUniformLocation(program, "u_image")) != -1)
-    {
-        glUniform1i(location, 0);
-    }
+	glUseProgram(program);
 
-    return program;
+	if ((location = glGetUniformLocation(program, "u_image")) != -1)
+	{
+		glUniform1i(location, 0);
+	}
+
+	return program;
 }
 
 //-------------------------------
@@ -369,30 +501,30 @@ GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath){
 //-------------------------------
 
 void cleanupCuda(){
-  if(pbo) deletePBO(&pbo);
-  if(displayImage) deleteTexture(&displayImage);
+	if(pbo) deletePBO(&pbo);
+	if(displayImage) deleteTexture(&displayImage);
 }
 
 void deletePBO(GLuint* pbo){
-  if (pbo) {
-    // unregister this buffer object with CUDA
-    cudaGLUnregisterBufferObject(*pbo);
-    
-    glBindBuffer(GL_ARRAY_BUFFER, *pbo);
-    glDeleteBuffers(1, pbo);
-    
-    *pbo = (GLuint)NULL;
-  }
+	if (pbo) {
+		// unregister this buffer object with CUDA
+		cudaGLUnregisterBufferObject(*pbo);
+
+		glBindBuffer(GL_ARRAY_BUFFER, *pbo);
+		glDeleteBuffers(1, pbo);
+
+		*pbo = (GLuint)NULL;
+	}
 }
 
 void deleteTexture(GLuint* tex){
-    glDeleteTextures(1, tex);
-    *tex = (GLuint)NULL;
+	glDeleteTextures(1, tex);
+	*tex = (GLuint)NULL;
 }
- 
+
 void shut_down(int return_code){
-  #ifdef __APPLE__
+#ifdef __APPLE__
 	glfwTerminate();
-  #endif
-  exit(return_code);
+#endif
+	exit(return_code);
 }
diff --git a/src/main.h b/src/main.h
index 0bab7cb..1c999d0 100755
--- a/src/main.h
+++ b/src/main.h
@@ -15,7 +15,9 @@
 	#include <GL/glut.h>
 #endif
 
+
 #include <stdlib.h>
+#include <ctime>
 #include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
 #include <string>
@@ -29,6 +31,7 @@
 #include "raytraceKernel.h"
 #include "utilities.h"
 #include "scene.h"
+#include "cudaAlgorithms.h"
 
 #if CUDA_VERSION >= 5000
     #include <helper_cuda.h>
@@ -42,14 +45,17 @@
 
 using namespace std;
 
+
 //-------------------------------
 //----------PATHTRACER-----------
 //-------------------------------
 
 scene* renderScene;
 camera* renderCam;
+renderOptions* renderOpts;
 int targetFrame;
 int iterations;
+int frameFilterCounter;
 bool finishedRender;
 bool singleFrameMode;
 
@@ -79,6 +85,7 @@ int main(int argc, char** argv);
 //---------RUNTIME STUFF---------
 //-------------------------------
 
+float fps;
 void runCuda();
 
 #ifdef __APPLE__
diff --git a/src/nvTools/nvToolsExt.h b/src/nvTools/nvToolsExt.h
new file mode 100644
index 0000000..87b9bba
--- /dev/null
+++ b/src/nvTools/nvToolsExt.h
@@ -0,0 +1,678 @@
+/*
+* Copyright 2009-2012  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+
+/** \mainpage
+ * \section Introduction
+ * The NVIDIA Tools Extension library is a set of functions that a
+ * developer can use to provide additional information to tools.
+ * The additional information is used by the tool to improve
+ * analysis and visualization of data.
+ *
+ * The library introduces close to zero overhead if no tool is
+ * attached to the application.  The overhead when a tool is
+ * attached is specific to the tool.
+ */
+
+#ifndef NVTOOLSEXT_H_
+#define NVTOOLSEXT_H_
+
+#if defined(_MSC_VER) /* Microsoft Visual C++ Compiler */
+    #ifdef NVTX_EXPORTS
+        #define NVTX_DECLSPEC
+    #else
+        #define NVTX_DECLSPEC __declspec(dllimport)
+    #endif /* NVTX_EXPORTS */
+    #define NVTX_API __stdcall
+#else /* GCC and most other compilers */
+    #define NVTX_DECLSPEC
+    #define NVTX_API
+#endif /* Platform */
+
+/**
+ * The nvToolsExt library depends on stdint.h.  If the build tool chain in use
+ * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED
+ * and define the following types:
+ * <ul>
+ *   <li>uint8_t
+ *   <li>int8_t
+ *   <li>uint16_t
+ *   <li>int16_t
+ *   <li>uint32_t
+ *   <li>int32_t
+ *   <li>uint64_t
+ *   <li>int64_t
+ *   <li>uintptr_t
+ *   <li>intptr_t
+ * </ul>
+ #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file.
+ */
+#ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED
+#include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * Tools Extension API version
+ */
+#define NVTX_VERSION 1
+
+/**
+ * Size of the nvtxEventAttributes_t structure.
+ */
+#define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_v1) ) )
+
+#define NVTX_NO_PUSH_POP_TRACKING ((int)-2)
+
+typedef uint64_t nvtxRangeId_t;
+
+/** \page EVENT_ATTRIBUTES Event Attributes
+ *
+ * \ref MARKER_AND_RANGES can be annotated with various attributes to provide
+ * additional information for an event or to guide the tool's visualization of
+ * the data. Each of the attributes is optional and if left unused the
+ * attributes fall back to a default value.
+ *
+ * To specify any attribute other than the text message, the \ref
+ * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used.
+ */
+
+/** ---------------------------------------------------------------------------
+ * Color Types
+ * ------------------------------------------------------------------------- */
+typedef enum nvtxColorType_t
+{
+    NVTX_COLOR_UNKNOWN  = 0,                 /**< Color attribute is unused. */
+    NVTX_COLOR_ARGB     = 1                  /**< An ARGB color is provided. */
+} nvtxColorType_t;
+
+/** ---------------------------------------------------------------------------
+ * Payload Types
+ * ------------------------------------------------------------------------- */
+typedef enum nvtxPayloadType_t
+{
+    NVTX_PAYLOAD_UNKNOWN                = 0,   /**< Payload is unused. */
+    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64    = 1,   /**< An unsigned integer value is used as payload. */
+    NVTX_PAYLOAD_TYPE_INT64             = 2,   /**< A signed integer value is used as payload. */
+    NVTX_PAYLOAD_TYPE_DOUBLE            = 3    /**< A floating point value is used as payload. */
+} nvtxPayloadType_t;
+
+/** ---------------------------------------------------------------------------
+ * Message Types
+ * ------------------------------------------------------------------------- */
+typedef enum nvtxMessageType_t
+{
+    NVTX_MESSAGE_UNKNOWN        = 0,         /**< Message payload is unused. */
+    NVTX_MESSAGE_TYPE_ASCII     = 1,         /**< A character sequence is used as payload. */
+    NVTX_MESSAGE_TYPE_UNICODE   = 2          /**< A wide character sequence is used as payload. */
+} nvtxMessageType_t;
+
+/** \brief Event Attribute Structure.
+ * \anchor EVENT_ATTRIBUTE_STRUCTURE
+ *
+ * This structure is used to describe the attributes of an event. The layout of
+ * the structure is defined by a specific version of the tools extension
+ * library and can change between different versions of the Tools Extension
+ * library.
+ *
+ * \par Initializing the Attributes
+ *
+ * The caller should always perform the following three tasks when using
+ * attributes:
+ * <ul>
+ *    <li>Zero the structure
+ *    <li>Set the version field
+ *    <li>Set the size field
+ * </ul>
+ *
+ * Zeroing the structure sets all the event attributes types and values
+ * to the default value.
+ *
+ * The version and size field are used by the Tools Extension
+ * implementation to handle multiple versions of the attributes structure.
+ *
+ * It is recommended that the caller use one of the following two methods
+ * to initialize the event attributes structure:
+ *
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * \endcode
+ *
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = 1;
+ * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1));
+ * \endcode
+ *
+ * If the caller uses Method 1 it is critical that the entire binary
+ * layout of the structure be configured to 0 so that all fields
+ * are initialized to the default value.
+ *
+ * The caller should either use both NVTX_VERSION and
+ * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
+ * and a versioned type (Method 2).  Using a mix of the two methods
+ * will likely cause either source level incompatibility or binary
+ * incompatibility in the future.
+ *
+ * \par Settings Attribute Types and Values
+ *
+ *
+ * \par Example:
+ * \code
+ * // Initialize
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ *
+ * // Configure the Attributes
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFF880000;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Example";
+ * \endcode
+ *
+ * In the example the caller does not have to set the value of
+ * \ref ::nvtxEventAttributes_v1::category or
+ * \ref ::nvtxEventAttributes_v1::payload as these fields were set to
+ * the default value by {0}.
+ * \sa
+ * ::nvtxMarkEx
+ * ::nvtxRangeStartEx
+ * ::nvtxRangePushEx
+ */
+typedef struct nvtxEventAttributes_v1
+{
+    /**
+    * \brief Version flag of the structure.
+    *
+    * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
+    * supported in this header file. This can optionally be overridden to
+    * another version of the tools extension library.
+    */
+    uint16_t version;
+
+    /**
+    * \brief Size of the structure.
+    *
+    * Needs to be set to the size in bytes of the event attribute
+    * structure used to specify the event.
+    */
+    uint16_t size;
+
+    /**
+     * \brief ID of the category the event is assigned to.
+     *
+     * A category is a user-controlled ID that can be used to group
+     * events.  The tool may use category IDs to improve filtering or
+     * enable grouping of events in the same category. The functions
+     * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
+     * to name a category.
+     *
+     * Default Value is 0
+     */
+    uint32_t category;
+
+    /** \brief Color type specified in this attribute structure.
+     *
+     * Defines the color format of the attribute structure's \ref COLOR_FIELD
+     * "color" field.
+     *
+     * Default Value is NVTX_COLOR_UNKNOWN
+     */
+    int32_t colorType;              /* nvtxColorType_t */
+
+    /** \brief Color assigned to this event. \anchor COLOR_FIELD
+     *
+     * The color that the tool should use to visualize the event.
+     */
+    uint32_t color;
+
+    /**
+     * \brief Payload type specified in this attribute structure.
+     *
+     * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD
+     * "payload" field.
+     *
+     * Default Value is NVTX_PAYLOAD_UNKNOWN
+     */
+    int32_t payloadType;            /* nvtxPayloadType_t */
+
+    int32_t reserved0;
+
+    /**
+     * \brief Payload assigned to this event. \anchor PAYLOAD_FIELD
+     *
+     * A numerical value that can be used to annotate an event. The tool could
+     * use the payload data to reconstruct graphs and diagrams.
+     */
+    union payload_t
+    {
+        uint64_t ullValue;
+        int64_t llValue;
+        double dValue;
+    } payload;
+
+    /** \brief Message type specified in this attribute structure.
+     *
+     * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
+     * "message" field.
+     *
+     * Default Value is NVTX_MESSAGE_UNKNOWN
+     */
+    int32_t messageType;            /* nvtxMessageType_t */
+
+    /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
+     *
+     * The text message that is attached to an event.
+     */
+    union message_t
+    {
+        const char* ascii;
+        const wchar_t* unicode;
+    } message;
+
+} nvtxEventAttributes_v1;
+
+typedef struct nvtxEventAttributes_v1 nvtxEventAttributes_t;
+
+/* ========================================================================= */
+/** \defgroup MARKER_AND_RANGES Marker and Ranges
+ *
+ * Markers and ranges are used to describe events at a specific time (markers)
+ * or over a time span (ranges) during the execution of the application
+ * respectively. The additional information is presented alongside all other
+ * captured data and facilitates understanding of the collected information.
+ */
+
+/* ========================================================================= */
+/** \name Markers
+ */
+/** \name Markers
+ */
+/** \addtogroup MARKER_AND_RANGES
+ * \section MARKER Marker
+ *
+ * A marker describes a single point in time.  A marker event has no side effect
+ * on other events.
+ *
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks an instantaneous event in the application.
+ *
+ * A marker can contain a text message or specify additional information
+ * using the event attributes structure.  These attributes include a text
+ * message, color, category, and a payload. Each of the attributes is optional
+ * and can only be sent out using the \ref nvtxMarkEx function.
+ * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker
+ * or if an attribute is unspecified then a default value will be used.
+ *
+ * \param eventAttrib - The event attribute structure defining the marker's
+ * attribute types and attribute values.
+ *
+ * \par Example:
+ * \code
+ * // zero the structure
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * // set the version and the size information
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * // configure the attributes.  0 is the default for all attributes.
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFF880000;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Example nvtxMarkEx";
+ * nvtxMarkEx(&eventAttrib);
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks an instantaneous event in the application.
+ *
+ * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a
+ * text message.
+ *
+ * \param message     - The message associated to this marker event.
+ *
+ * \par Example:
+ * \code
+ * nvtxMarkA("Example nvtxMarkA");
+ * nvtxMarkW(L"Example nvtxMarkW");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message);
+NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message);
+/** @} */
+
+/** @} */ /* END MARKER_AND_RANGES */
+
+/* ========================================================================= */
+/** \name Start/Stop Ranges
+ */
+/** \addtogroup MARKER_AND_RANGES
+ * \section INDEPENDENT_RANGES Start/Stop Ranges
+ *
+ * Start/Stop ranges denote a time span that can expose arbitrary concurrency -
+ * opposed to Push/Pop ranges that only support nesting. In addition the start
+ * of a range can happen on a different thread than the end. For the
+ * correlation of a start/end pair a unique correlation ID is used that is
+ * returned from the start API call and needs to be passed into the end API
+ * call.
+ *
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks the start of a range.
+ *
+ * \param eventAttrib - The event attribute structure defining the range's
+ * attribute types and attribute values.
+ *
+ * \return The unique ID used to correlate a pair of Start and End events.
+ *
+ * \remarks Ranges defined by Start/End can overlap.
+ *
+ * \par Example:
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * eventAttrib.category = 3;
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFF0088FF;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Example RangeStartEnd";
+ * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib);
+ * // ...
+ * nvtxRangeEnd(rangeId);
+ * \endcode
+ *
+ * \sa
+ * ::nvtxRangeEnd
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks the start of a range.
+ *
+ * \param message     - The event message associated to this range event.
+ *
+ * \return The unique ID used to correlate a pair of Start and End events.
+ *
+ * \remarks Ranges defined by Start/End can overlap.
+ *
+ * \par Example:
+ * \code
+ * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1");
+ * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2");
+ * nvtxRangeEnd(r1);
+ * nvtxRangeEnd(r2);
+ * \endcode
+ * \sa
+ * ::nvtxRangeEnd
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message);
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks the end of a range.
+ *
+ * \param id - The correlation ID returned from a nvtxRangeStart call.
+ *
+ * \sa
+ * ::nvtxRangeStartEx
+ * ::nvtxRangeStartA
+ * ::nvtxRangeStartW
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id);
+/** @} */
+
+/** @} */
+
+
+/* ========================================================================= */
+/** \name Push/Pop Ranges
+ */
+/** \addtogroup MARKER_AND_RANGES
+ * \section PUSH_POP_RANGES Push/Pop Ranges
+ *
+ * Push/Pop ranges denote nested time ranges. Nesting is maintained per thread
+ * and does not require any additional correlation mechanism. The duration of a
+ * push/pop range is defined by the corresponding pair of Push/Pop API calls.
+ *
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks the start of a nested range
+ *
+ * \param eventAttrib - The event attribute structure defining the range's
+ * attribute types and attribute values.
+ *
+ * \return The 0 based level of range being started.  If an error occurs a
+ * negative value is returned.
+ *
+ * \par Example:
+ * \code
+ * nvtxEventAttributes_t eventAttrib = {0};
+ * eventAttrib.version = NVTX_VERSION;
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
+ * eventAttrib.color = 0xFFFF0000;
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+ * eventAttrib.message.ascii = "Level 0";
+ * nvtxRangePushEx(&eventAttrib);
+ *
+ * // Re-use eventAttrib
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
+ * eventAttrib.message.unicode = L"Level 1";
+ * nvtxRangePushEx(&eventAttrib);
+ *
+ * nvtxRangePop();
+ * nvtxRangePop();
+ * \endcode
+ *
+ * \sa
+ * ::nvtxRangePop
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks the start of a nested range
+ *
+ * \param message     - The event message associated to this range event.
+ *
+ * \return The 0 based level of range being started.  If an error occurs a
+ * negative value is returned.
+ *
+ * \par Example:
+ * \code
+ * nvtxRangePushA("Level 0");
+ * nvtxRangePushW(L"Level 1");
+ * nvtxRangePop();
+ * nvtxRangePop();
+ * \endcode
+ *
+ * \sa
+ * ::nvtxRangePop
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message);
+NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Marks the end of a nested range
+ *
+ * \return The level of the range being ended. If an error occurs a negative
+ * value is returned on the current thread.
+ *
+ * \sa
+ * ::nvtxRangePushEx
+ * ::nvtxRangePushA
+ * ::nvtxRangePushW
+ *
+ * \version \NVTX_VERSION_0
+ * @{ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
+/** @} */
+
+/** @} */
+
+/* ========================================================================= */
+/** \defgroup RESOURCE_NAMING Resource Naming
+ *
+ * This section covers calls that allow to annotate objects with user-provided
+ * names in order to allow for a better analysis of complex trace data. All of
+ * the functions take the handle or the ID of the object to name and the name.
+ * The functions can be called multiple times during the execution of an
+ * application, however, in that case it is implementation dependent which
+ * name will be reported by the tool.
+ *
+ * \section RESOURCE_NAMING_NVTX NVTX Resource Naming
+ * The NVIDIA Tools Extension library allows to attribute events with additional
+ * information such as category IDs. These category IDs can be annotated with
+ * user-provided names using the respective resource naming functions.
+ *
+ * \section RESOURCE_NAMING_OS OS Resource Naming
+ * In order to enable a tool to report system threads not just by their thread
+ * identifier, the NVIDIA Tools Extension library allows to provide user-given
+ * names to these OS resources.
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \name Functions for NVTX Resource Naming
+ */
+/** @{
+ * \brief Annotate an NVTX category.
+ *
+ * Categories are used to group sets of events. Each category is identified
+ * through a unique ID and that ID is passed into any of the marker/range
+ * events to assign that event to a specific category. The nvtxNameCategory
+ * function calls allow the user to assign a name to a category ID.
+ *
+ * \param category - The category ID to name.
+ * \param name     - The name of the category.
+ *
+ * \remarks The category names are tracked per process.
+ *
+ * \par Example:
+ * \code
+ * nvtxNameCategory(1, "Memory Allocation");
+ * nvtxNameCategory(2, "Memory Transfer");
+ * nvtxNameCategory(3, "Memory Object Lifetime");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \name Functions for OS Resource Naming
+ */
+/** @{
+ * \brief Annotate an OS thread.
+ *
+ * Allows the user to name an active thread of the current process. If an
+ * invalid thread ID is provided or a thread ID from a different process is
+ * used the behavior of the tool is implementation dependent.
+ *
+ * \param threadId - The ID of the thread to name.
+ * \param name     - The name of the thread.
+ *
+ * \par Example:
+ * \code
+ * nvtxNameOsThread(GetCurrentThreadId(), "MAIN_THREAD");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ */
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+
+#ifdef UNICODE
+    #define nvtxMark            nvtxMarkW
+    #define nvtxRangeStart      nvtxRangeStartW
+    #define nvtxRangePush       nvtxRangePushW
+    #define nvtxNameCategory    nvtxNameCategoryW
+    #define nvtxNameOsThread    nvtxNameOsThreadW
+#else
+    #define nvtxMark            nvtxMarkA
+    #define nvtxRangeStart      nvtxRangeStartA
+    #define nvtxRangePush       nvtxRangePushA
+    #define nvtxNameCategory    nvtxNameCategoryA
+    #define nvtxNameOsThread    nvtxNameOsThreadA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXT_H_ */
diff --git a/src/nvTools/nvToolsExtCuda.h b/src/nvTools/nvToolsExtCuda.h
new file mode 100644
index 0000000..e53874f
--- /dev/null
+++ b/src/nvTools/nvToolsExtCuda.h
@@ -0,0 +1,144 @@
+/*
+* Copyright 2009-2012  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+
+#ifndef NVTOOLSEXT_CUDA_H_
+#define NVTOOLSEXT_CUDA_H_
+
+#include "cuda.h"
+
+#include "nvToolsExt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
+ *
+ * This section covers the API functions that allow annotating CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The handle of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA context.
+ *
+ * Allows the user to associate a CUDA context with a user-provided name.
+ *
+ * \param context - The handle of the CUDA context to name.
+ * \param name    - The name of the CUDA context.
+ *
+ * \par Example:
+ * \code
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
+ * if ( CUDA_SUCCESS != status )
+ *     goto Error;
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameCuDevice   nvtxNameCuDeviceW
+  #define nvtxNameCuContext  nvtxNameCuContextW
+  #define nvtxNameCuStream   nvtxNameCuStreamW
+  #define nvtxNameCuEvent    nvtxNameCuEventW
+#else
+  #define nvtxNameCuDevice   nvtxNameCuDeviceA
+  #define nvtxNameCuContext  nvtxNameCuContextA
+  #define nvtxNameCuStream   nvtxNameCuStreamA
+  #define nvtxNameCuEvent    nvtxNameCuEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXT_CUDA_H_ */
diff --git a/src/nvTools/nvToolsExtCudaRt.h b/src/nvTools/nvToolsExtCudaRt.h
new file mode 100644
index 0000000..6def6a5
--- /dev/null
+++ b/src/nvTools/nvToolsExtCudaRt.h
@@ -0,0 +1,121 @@
+/*
+* Copyright 2012  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+
+#ifndef NVTOOLSEXT_CUDART_H_
+#define NVTOOLSEXT_CUDART_H_
+
+#include "cuda.h"
+#include "driver_types.h"
+
+#include "nvToolsExt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
+ *
+ * This section covers the API functions that allow annotating CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The id of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameCudaDevice nvtxNameCudaDeviceW
+  #define nvtxNameCudaStream nvtxNameCudaStreamW
+  #define nvtxNameCudaEvent  nvtxNameCudaEventW
+#else
+  #define nvtxNameCudaDevice nvtxNameCudaDeviceA
+  #define nvtxNameCudaStream nvtxNameCudaStreamA
+  #define nvtxNameCudaEvent  nvtxNameCudaEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXT_CUDART_H_ */
diff --git a/src/nvTools/nvToolsExtOpenCL.h b/src/nvTools/nvToolsExtOpenCL.h
new file mode 100644
index 0000000..8f652dc
--- /dev/null
+++ b/src/nvTools/nvToolsExtOpenCL.h
@@ -0,0 +1,191 @@
+/*
+* Copyright 2009-2012  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+
+#ifndef NVTOOLSEXT_OPENCL_H_
+#define NVTOOLSEXT_OPENCL_H_
+
+#include <CL/cl.h>
+
+#include "nvToolsExt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ========================================================================= */
+/** \name Functions for OpenCL Resource Naming
+ */
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
+ *
+ * This section covers the API functions that allow annotating OpenCL resources
+ * with user-provided names.
+ *
+ * @{
+ */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL device.
+ *
+ * Allows the user to associate an OpenCL device with a user-provided name.
+ *
+ * \param device - The handle of the OpenCL device to name.
+ * \param name   - The name of the OpenCL device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL context.
+ *
+ * Allows the user to associate an OpenCL context with a user-provided name.
+ *
+ * \param context - The handle of the OpenCL context to name.
+ * \param name    - The name of the OpenCL context.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL command queue.
+ *
+ * Allows the user to associate an OpenCL command queue with a user-provided name.
+ *
+ * \param command_queue - The handle of the OpenCL command queue to name.
+ * \param name          - The name of the OpenCL command queue.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL memory object.
+ *
+ * Allows the user to associate an OpenCL memory object with a user-provided name.
+ *
+ * \param memobj - The handle of the OpenCL memory object to name.
+ * \param name   - The name of the OpenCL memory object.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL sampler.
+ *
+ * Allows the user to associate an OpenCL sampler with a user-provided name.
+ *
+ * \param sampler - The handle of the OpenCL sampler to name.
+ * \param name    - The name of the OpenCL sampler.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL program.
+ *
+ * Allows the user to associate an OpenCL program with a user-provided name.
+ *
+ * \param program - The handle of the OpenCL program to name.
+ * \param name    - The name of the OpenCL program.
+ *
+ * \code
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
+ *     (const char **) &cSourceCL, &program_length, &ciErrNum);
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+ * nvtxNameClProgram(cpProgram, "PROGRAM_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
+/** @} */
+
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL event.
+ *
+ * Allows the user to associate an OpenCL event with a user-provided name.
+ *
+ * \param evnt - The handle of the OpenCL event to name.
+ * \param name - The name of the OpenCL event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
+/** @} */
+
+/** @} */ /* END RESOURCE_NAMING */
+
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameClDevice        nvtxNameClDeviceW
+  #define nvtxNameClContext       nvtxNameClContextW
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
+  #define nvtxNameClMemObject     nvtxNameClMemObjectW
+  #define nvtxNameClSampler       nvtxNameClSamplerW
+  #define nvtxNameClProgram       nvtxNameClProgramW
+  #define nvtxNameClEvent         nvtxNameClEventW
+#else
+  #define nvtxNameClDevice        nvtxNameClDeviceA
+  #define nvtxNameClContext       nvtxNameClContextA
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
+  #define nvtxNameClMemObject     nvtxNameClMemObjectA
+  #define nvtxNameClSampler       nvtxNameClSamplerA
+  #define nvtxNameClProgram       nvtxNameClProgramA
+  #define nvtxNameClEvent         nvtxNameClEventA
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXT_OPENCL_H_ */
diff --git a/src/raytraceKernel.cu b/src/raytraceKernel.cu
index 87a65a6..9ff7cf7 100755
--- a/src/raytraceKernel.cu
+++ b/src/raytraceKernel.cu
@@ -17,211 +17,522 @@
 #include "glm/glm.hpp"
 
 void checkCUDAError(const char *msg) {
-  cudaError_t err = cudaGetLastError();
-  if( cudaSuccess != err) {
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 
-    exit(EXIT_FAILURE); 
-  }
+	cudaError_t err = cudaGetLastError(); //most recent error reported by the CUDA runtime
+	if( cudaSuccess != err) {
+		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); //msg is the caller-supplied label printed before the CUDA error text
+		exit(EXIT_FAILURE); //any CUDA error terminates the process
+	}
 } 
 
 //LOOK: This function demonstrates how to use thrust for random number generation on the GPU!
 //Function that generates static.
 __host__ __device__ glm::vec3 generateRandomNumberFromThread(glm::vec2 resolution, float time, int x, int y){
-  int index = x + (y * resolution.x);
-   
-  thrust::default_random_engine rng(hash(index*time));
-  thrust::uniform_real_distribution<float> u01(0,1);
+	int index = x + (y * resolution.x);
 
-  return glm::vec3((float) u01(rng), (float) u01(rng), (float) u01(rng));
+	thrust::default_random_engine rng(hash(index*time)); //engine seeded from hash(index*time)
+	thrust::uniform_real_distribution<float> u01(0,1); //distribution constructed with bounds 0 and 1
+
+	return glm::vec3((float) u01(rng), (float) u01(rng), (float) u01(rng)); //one draw from u01 per component
+}
+
+//Builds the primary camera ray through pixel (x, y). The pixel coordinates are floats, so callers can pass subpixel positions.
+//The ray starts at eye; its direction is the normalized sum of view and the pixel's offsets along right and up.
+__host__ __device__ ray raycastFromCamera(glm::vec2 resolution, float x, float y, glm::vec3 eye, glm::vec3 view, glm::vec3 up, glm::vec2 fov){
+	//horizontal axis of the view plane
+	glm::vec3 right = glm::cross(view, up);
+
+	//pixel position remapped to -1..1 for coordinates between 0 and the resolution; fov is converted from degrees before tan
+	float offsetX = 2*x/resolution.x-1;
+	float offsetY = 2*y/resolution.y-1;
+	glm::vec3 horizontal = offsetX*right*glm::tan(glm::radians(fov.x));
+	glm::vec3 vertical = offsetY*up*glm::tan(glm::radians(fov.y));
+
+	ray primary;
+	primary.origin = eye;
+	primary.direction = glm::normalize(view + horizontal - vertical);
+	return primary;
+}
+
+
+//Multiplies every pixel of the image by the float factor sf. One thread handles one pixel.
+__global__ void scaleImageIntensity(glm::vec2 resolution, glm::vec3* image, float sf)
+{
+	const int col = (blockIdx.x * blockDim.x) + threadIdx.x;
+	const int row = (blockIdx.y * blockDim.y) + threadIdx.y;
+	const int pixel = col + (row * resolution.x);
+	if(!(col<resolution.x && row<resolution.y)){
+		return;//this thread lies outside the image
+	}
+	image[pixel] = sf*image[pixel];
+}
+
+//For every pool entry with a pixel assigned (index >= 0), resets the ray state and aims a primary ray through that pixel; with rconfig.antialiasing the position is jittered by a draw from u01(-0.5,0.5) per axis. frame is not referenced.
+__global__ void raycastFromCameraKernel(int seed, int frame, cameraData cam, renderOptions rconfig, rayState* cudaraypool, int rayPoolSize)
+{
+	//one thread per ray; the index uses threadIdx.x only
+	int blockId   = blockIdx.y * gridDim.x + blockIdx.x; //linear id of this block within the 2D grid
+	int rIndex = blockId * blockDim.x + threadIdx.x;
+	if(rIndex < rayPoolSize){
+		//read from global mem
+		rayState rstate = cudaraypool[rIndex];
+		int pixelIndex = rstate.index;
+		if(pixelIndex >= 0){
+			int x = pixelIndex % int(cam.resolution.x); //column of the pixel
+			int y = (pixelIndex - x)/int(cam.resolution.x); //row of the pixel
+
+			//Reset other fields
+			rstate.T = glm::vec3(1,1,1);
+			rstate.matIndex = -1;
+			if(rconfig.antialiasing){
+				thrust::default_random_engine rng(hash(seed*rIndex));//TODO: Improve randomness (seed*rIndex is 0 for ray 0 whatever the seed)
+				thrust::uniform_real_distribution<float> u01(-0.5,0.5);
+				rstate.r =raycastFromCamera(cam.resolution, x+u01(rng), y+u01(rng), cam.position, cam.view, cam.up, cam.fov); //NOTE(review): both draws are in one argument list, so which draw goes to x is unspecified
+			}else{
+				rstate.r =raycastFromCamera(cam.resolution, x, y, cam.position, cam.view, cam.up, cam.fov);
+			}
+			rstate.bounceType = PRIMARY;
+			//write back to global mem
+			cudaraypool[rIndex] = rstate;
+
+		}
+	}
+
+}
+
+//Assigns a pixel index to every ray in the pool: either stochastically, driven by the single random number xi1, or one ray per pixel in order.
+//xi1 is a uniformly distributed random number from 0 to 1
+__global__ void allocateRayPool(float xi1, renderOptions rconfig, cameraData cam, glm::vec3* cudaimage, rayState* cudaraypool, int numRays)
+{
+	//1D blocks and 2D grid
+	//cudaimage is not referenced in this kernel.
+	int blockId   = blockIdx.y * gridDim.x + blockIdx.x;
+	int rIndex = blockId * blockDim.x + threadIdx.x;
+
+	if(rIndex < numRays){//Thread index range check
+
+		int numPixels = cam.resolution.x*cam.resolution.y;
+
+		//Allocate all rays stochastically
+		if(rconfig.stocasticRayAssignment){
+			float P = float(numPixels)/numRays;//compute stochastic interval
+			int start =  floor(xi1*numPixels); //starting pixel derived from xi1
+			cudaraypool[rIndex].index = ((int)(start + P*rIndex) % numPixels); //advance P pixels per ray from start, wrapping at numPixels
+		}else{
+			if(rIndex < numPixels)
+				cudaraypool[rIndex].index = rIndex; //ray i gets pixel i
+			else
+				cudaraypool[rIndex].index = -1; //surplus rays get no pixel
+		}
+	}
 }
 
-//Kernel that does the initial raycast from the camera.
-__host__ __device__ ray raycastFromCameraKernel(glm::vec2 resolution, float time, int x, int y, glm::vec3 eye, glm::vec3 view, glm::vec3 up, glm::vec2 fov){
-   
-  int index = x + (y * resolution.x);
-   
-  thrust::default_random_engine rng(hash(index*time));
-  thrust::uniform_real_distribution<float> u01(0,1);
-  
-  //standard camera raycast stuff
-  glm::vec3 E = eye;
-  glm::vec3 C = view;
-  glm::vec3 U = up;
-  float fovx = fov.x;
-  float fovy = fov.y;
-  
-  float CD = glm::length(C);
-  
-  glm::vec3 A = glm::cross(C, U);
-  glm::vec3 B = glm::cross(A, C);
-  glm::vec3 M = E+C;
-  glm::vec3 H = (A*float(CD*tan(fovx*(PI/180))))/float(glm::length(A));
-  glm::vec3 V = (B*float(CD*tan(-fovy*(PI/180))))/float(glm::length(B));
-  
-  float sx = (x)/(resolution.x-1);
-  float sy = (y)/(resolution.y-1);
-  
-  glm::vec3 P = M + (((2*sx)-1)*H) + (((2*sy)-1)*V);
-  glm::vec3 PmE = P-E;
-  glm::vec3 R = E + (float(200)*(PmE))/float(glm::length(PmE));
-  
-  glm::vec3 direction = glm::normalize(R);
-  //major performance cliff at this point, TODO: find out why!
-  ray r;
-  r.origin = eye;
-  r.direction = direction;
-  return r;
+__global__ void displayRayCounts(cameraData cam, renderOptions rconfig, glm::vec3* cudaimage, rayState* cudaraypool, int numRays, float maxScale)
+{ //Adds clamp(1/maxScale, 0, 1) to every channel of the pixel assigned to each ray in the pool. cam and rconfig are not referenced.
+	int blockId   = blockIdx.y * gridDim.x + blockIdx.x;
+	int rIndex = blockId * blockDim.x + threadIdx.x;
+
+	if(rIndex < numRays){//Thread index range check
+		int pixelIndex = cudaraypool[rIndex].index;
+		if(pixelIndex >= 0) //rays with a negative index are skipped
+		{
+			float scale = clamp(1.0f/maxScale, 0.0f, 1.0f);
+			cudaimage[pixelIndex] += scale*glm::vec3(1,1,1); //NOTE(review): unsynchronized read-modify-write; rays sharing a pixelIndex may lose increments - confirm whether atomics are needed
+		}
+	}
+
 }
 
 //Kernel that blacks out a given image buffer
 __global__ void clearImage(glm::vec2 resolution, glm::vec3* image){
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * resolution.x);
-    if(x<=resolution.x && y<=resolution.y){
-      image[index] = glm::vec3(0,0,0);
-    }
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x; //pixel column handled by this thread
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y; //pixel row handled by this thread
+	int index = x + (y * resolution.x);
+	if(x<resolution.x && y<resolution.y){ //strict bounds: x==resolution.x would alias the next row and y==resolution.y is one row past the image
+		image[index] = glm::vec3(0,0,0);
+	}
 }
 
+
 //Kernel that writes the image to the OpenGL PBO directly. 
-__global__ void sendImageToPBO(uchar4* PBOpos, glm::vec2 resolution, glm::vec3* image){
-  
-  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-  int index = x + (y * resolution.x);
-  
-  if(x<=resolution.x && y<=resolution.y){
-
-      glm::vec3 color;      
-      color.x = image[index].x*255.0;
-      color.y = image[index].y*255.0;
-      color.z = image[index].z*255.0;
-
-      if(color.x>255){
-        color.x = 255;
-      }
-
-      if(color.y>255){
-        color.y = 255;
-      }
-
-      if(color.z>255){
-        color.z = 255;
-      }
-      
-      // Each thread writes one pixel location in the texture (textel)
-      PBOpos[index].w = 0;
-      PBOpos[index].x = color.x;     
-      PBOpos[index].y = color.y;
-      PBOpos[index].z = color.z;
-  }
+__global__ void sendImageToPBO(uchar4* PBOpos, glm::vec2 resolution, glm::vec3* image, float scaleFactor){
+	//Writes image into PBOpos as 8-bit colour: each channel is multiplied by 255*scaleFactor and capped at 255; w is set to 0.
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * resolution.x);
+	//Strict bounds: x==resolution.x would alias the next row's first pixel and y==resolution.y is one row past the image.
+	if(x<resolution.x && y<resolution.y){
+
+		glm::vec3 color;
+		color.x = image[index].x*255.0*scaleFactor;
+		color.y = image[index].y*255.0*scaleFactor;
+		color.z = image[index].z*255.0*scaleFactor;
+
+		//Clamp the upper end only; negative values are not handled here
+		if(color.x>255){
+			color.x = 255;
+		}
+
+		if(color.y>255){
+			color.y = 255;
+		}
+
+		if(color.z>255){
+			color.z = 255;
+		}
+
+		// Each thread writes one pixel location in the texture (texel)
+		PBOpos[index].w = 0;
+		PBOpos[index].x = color.x;
+		PBOpos[index].y = color.y;
+		PBOpos[index].z = color.z;
+	}
+}
+
+__global__ void traceRayFirstHit(cameraData cam, renderOptions rconfig, float time, int bounce, glm::vec3* colors,
+								 rayState* raypool, int numRays, staticGeom* geoms, int numberOfGeoms, material* materials, int numberOfMaterials)
+{
+	//Debug pass: per ray, accumulates the first-hit material colour (FIRST_HIT_DEBUG) or the absolute hit normal (NORMAL_DEBUG) into the ray's pixel.
+	int blockId   = blockIdx.y * gridDim.x + blockIdx.x; //Compute ray index
+	int rIndex = blockId * blockDim.x + threadIdx.x;
+
+	//time, bounce and numberOfMaterials are not referenced by this kernel.
+	//Pixel index of -1 indicates the ray's contribution has been recorded and is no longer in flight
+	if(rIndex < numRays)
+	{
+		//Thread has a ray, check if ray has a pixel
+		int pixelIndex = raypool[rIndex].index;
+		if(pixelIndex >= 0 && pixelIndex < (int)cam.resolution.x*(int)cam.resolution.y)
+		{
+			//valid pixel index
+			ray r = raypool[rIndex].r;
+			//NOTE(review): colors[pixelIndex] += below is not atomic; rays sharing a pixel may race - confirm
+			float dist;
+			glm::vec3 intersectionPoint;
+			glm::vec3 normal;
+			int ind = firstIntersect(geoms, numberOfGeoms, r, intersectionPoint, normal, dist); //ind >= 0 is treated as a hit below
+			if(rconfig.mode == FIRST_HIT_DEBUG){
+				if(ind >= 0)
+					colors[pixelIndex] += materials[geoms[ind].materialid].color;
+			}else if(rconfig.mode == NORMAL_DEBUG && ind >= 0){ //only use normal when something was hit; presumably it is not written on a miss - verify in firstIntersect
+				colors[pixelIndex] += glm::abs(normal);
+			}
+		}
+	}
 }
 
+
 //TODO: IMPLEMENT THIS FUNCTION
 //Core raytracer kernel
-__global__ void raytraceRay(glm::vec2 resolution, float time, float bounce, cameraData cam, int rayDepth, glm::vec3* colors, 
-                            staticGeom* geoms, int numberOfGeoms, material* materials, int numberOfMaterials){
-
-  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-  int index = x + (y * resolution.x);
-
-  ray r = raycastFromCameraKernel(resolution, time, x, y, cam.position, cam.view, cam.up, cam.fov);
-
-  if((x<=resolution.x && y<=resolution.y)){
-
-    float MAX_DEPTH = 100000000000000000;
-    float depth = MAX_DEPTH;
-
-    for(int i=0; i<numberOfGeoms; i++){
-        glm::vec3 intersectionPoint;
-        glm::vec3 intersectionNormal;
-       if(geoms[i].type==SPHERE){
-           depth = sphereIntersectionTest(geoms[i], r, intersectionPoint, intersectionNormal);
-        }else if(geoms[i].type==CUBE){
-            depth = boxIntersectionTest(geoms[i], r, intersectionPoint, intersectionNormal);
-        }else if(geoms[i].type==MESH){
-            //triangle tests go here
-        }else{
-            //lol?
-        }
-        if(depth<MAX_DEPTH && depth>-EPSILON){
-          MAX_DEPTH = depth;
-          colors[index] = materials[geoms[i].materialid].color;
-        }
-    }
-
-
-
-    //colors[index] = generateRandomNumberFromThread(resolution, time, x, y);
-   }
+__global__ void traceRay(cameraData cam, renderOptions rconfig, float time, int bounce, glm::vec3* colors, 
+						 rayState* raypool, int numRays, staticGeom* geoms, int numberOfGeoms, material* materials, int numberOfMaterials)
+{
+	//Compute ray index
+	int blockId   = blockIdx.y * gridDim.x + blockIdx.x;			 	
+	int rIndex = blockId * blockDim.x + threadIdx.x; 
+
+
+	//Pixel index of -1 indicates the ray's contribution has been recorded and is no longer in flight
+	if(rIndex < numRays)
+	{
+		//Thread has a ray, check if ray has a pixel
+		int pixelIndex = raypool[rIndex].index;
+		if(pixelIndex >= 0 && pixelIndex < (int)cam.resolution.x*(int)cam.resolution.y)
+		{
+			//valid pixel index
+			rayState rstate = raypool[rIndex];
+			//Check if ray is still useful
+			if(rstate.T.x > rconfig.minT || rstate.T.y > rconfig.minT || rstate.T.z > rconfig.minT)
+			{
+				//ray still has transmitance worth considering
+
+				//Find first collision
+				float dist;
+				glm::vec3 intersectionPoint;
+				glm::vec3 normal;
+				int ind = firstIntersect(geoms, numberOfGeoms, rstate.r, intersectionPoint, normal, dist);
+				if(rstate.bounceType == SHADOW){
+					if(ind == rconfig.globalLightGeomInd)
+					{
+						colors[pixelIndex] += rstate.T;
+					}
+					rstate.index = -1;
+				}else if(ind >= 0){
+					//we hit something!
+
+					//calculate transmission through material
+					glm::vec3 absorbtionCoeff;
+					if(rstate.matIndex >= 0 )
+						absorbtionCoeff = materials[rstate.matIndex].absorptionCoefficient;
+					else
+						absorbtionCoeff = rconfig.airAbsorbtion;
+
+					rstate.T *= calculateTransmission(absorbtionCoeff, dist);
+
+					//Transmission computed, now let's check on what we hit. This is where code will diverge quite a bit
+
+
+					//Check if it's a light
+					material m = materials[geoms[ind].materialid];
+					if(m.emittance > 0)
+					{
+						//hit a light source. Light it up.
+						if(rconfig.mode == TRACEDEPTH_DEBUG){
+							colors[pixelIndex] += bounce/float(rconfig.traceDepth)*glm::vec3(1,1,1);
+							rstate.index = -1;//retire ray
+						}else if(rconfig.mode == PATHTRACE){
+							colors[pixelIndex] += rstate.T*m.emittance*m.color;
+							rstate.index = -1;//retire ray
+						}
+					}else{
+						//Not a light
+						if(bounce < rconfig.traceDepth - 1){
+							//if we have more bounces to do, Bounce ray. 
+
+							//TODO: Improve randomness with point sets?
+							thrust::default_random_engine rng(hash(time*rIndex));
+							thrust::uniform_real_distribution<float> u01(0,1);
+							bounceRay(rstate, rconfig, intersectionPoint, normal, materials, geoms[ind].materialid, u01(rng), u01(rng), u01(rng));
+
+						}else{
+							//This was the last bounce. 
+							if(rconfig.mode == TRACEDEPTH_DEBUG)
+							{
+								colors[pixelIndex] += bounce/float(rconfig.traceDepth)*glm::vec3(1,1,1);
+								rstate.index = -1;//retire ray
+							}else if(rconfig.mode == PATHTRACE)
+							{
+								//This photon didn't come from anything
+								rstate.index = -1;
+
+							}
+						}
+					}
+				}else{
+					//How could you miss it was right in front of you!!
+					//retire ray, add global illumination compnent
+
+					if(rconfig.mode == TRACEDEPTH_DEBUG){
+						colors[pixelIndex] += bounce/float(rconfig.traceDepth)*glm::vec3(1,1,1);
+						rstate.index = -1;//retire ray
+					}else 
+						if(rconfig.mode == PATHTRACE){
+							//Compute global illumination component, we've hit the sky
+							staticGeom globalLight = geoms[rconfig.globalLightGeomInd];
+							material globalLightMat = materials[globalLight.materialid];
+							if(rstate.bounceType == DIFFUSE){
+								//Diffuse shading is special case, we may want to include shadows in our shading.
+								if(rconfig.globalLightGeomInd > -1)//if we have a global light
+								{
+									glm::vec3 globalLightPos;
+									if(globalLight.type == SPHERE)
+									{
+										globalLightPos = getRandomPointOnSphere(globalLight, time*(bounce+1)+rIndex);
+									}else{
+										globalLightPos = getRandomPointOnCube(globalLight, time*(bounce+1)+rIndex);
+									}
+									glm::vec3 globalLightDirection = glm::normalize(globalLightPos-rstate.r.origin);
+
+									float globalLightDot = clamp(glm::dot(rstate.r.direction, globalLightDirection),0.0f,1.0f);
+
+									if(rconfig.globalShadows){
+										//Cast shadow ray
+										rstate.T *= globalLightMat.absorptionCoefficient*globalLightMat.emittance*globalLightDot;
+										rstate.r.direction =  globalLightDirection;
+										rstate.bounceType = SHADOW;
+									}else{
+										//Approximate lambertian shading
+										colors[pixelIndex] += rstate.T*globalLightMat.absorptionCoefficient*globalLightMat.emittance*globalLightDot;
+										rstate.index = -1;//retire ray
+									}
+								}
+							}else{
+								//Primary or specular reflection, just display sky color
+								colors[pixelIndex] += rstate.T*globalLightMat.specularColor;
+								rstate.index = -1;//retire ray
+							}
+						}
+				}
+			}else{
+				//Ray no longer transmits any useful info
+				//Retire it
+				rstate.index = -1;
+			}
+			//Write back
+			raypool[rIndex] = rstate;
+		}	
+	}
+}
+
+
+
+
+//Compacts the ray pool: keeps the rays selected by RayAlive, frees the old pool, repoints *cudaraypool and returns the new ray count
+__host__ int raypoolCompaction(rayState** cudaraypool, int rayPoolSize)
+{
+	//Start from NULL so that, if streamCompaction leaves its output unset, no garbage pointer reaches a later cudaFree
+	rayState* compactPool = NULL;
+	RayAlive op;
+	int newCount = streamCompaction(*cudaraypool, &compactPool, rayPoolSize, op);
+	cudaFree(*cudaraypool);
+	*cudaraypool = compactPool;
+	//NOTE(review): assumes streamCompaction allocates compactPool on the device - confirm in cudaAlgorithms.h
+	return newCount;
 }
 
 
 //TODO: FINISH THIS FUNCTION
 // Wrapper for the __global__ call that sets up the kernel calls and does a ton of memory management
-void cudaRaytraceCore(uchar4* PBOpos, camera* renderCam, int frame, int iterations, material* materials, int numberOfMaterials, geom* geoms, int numberOfGeoms){
-  
-  int traceDepth = 1; //determines how many bounces the raytracer traces
-
-  // set up crucial magic
-  int tileSize = 8;
-  dim3 threadsPerBlock(tileSize, tileSize);
-  dim3 fullBlocksPerGrid((int)ceil(float(renderCam->resolution.x)/float(tileSize)), (int)ceil(float(renderCam->resolution.y)/float(tileSize)));
-  
-  //send image to GPU
-  glm::vec3* cudaimage = NULL;
-  cudaMalloc((void**)&cudaimage, (int)renderCam->resolution.x*(int)renderCam->resolution.y*sizeof(glm::vec3));
-  cudaMemcpy( cudaimage, renderCam->image, (int)renderCam->resolution.x*(int)renderCam->resolution.y*sizeof(glm::vec3), cudaMemcpyHostToDevice);
-  
-  //package geometry and materials and sent to GPU
-  staticGeom* geomList = new staticGeom[numberOfGeoms];
-  for(int i=0; i<numberOfGeoms; i++){
-    staticGeom newStaticGeom;
-    newStaticGeom.type = geoms[i].type;
-    newStaticGeom.materialid = geoms[i].materialid;
-    newStaticGeom.translation = geoms[i].translations[frame];
-    newStaticGeom.rotation = geoms[i].rotations[frame];
-    newStaticGeom.scale = geoms[i].scales[frame];
-    newStaticGeom.transform = geoms[i].transforms[frame];
-    newStaticGeom.inverseTransform = geoms[i].inverseTransforms[frame];
-    geomList[i] = newStaticGeom;
-  }
-  
-  staticGeom* cudageoms = NULL;
-  cudaMalloc((void**)&cudageoms, numberOfGeoms*sizeof(staticGeom));
-  cudaMemcpy( cudageoms, geomList, numberOfGeoms*sizeof(staticGeom), cudaMemcpyHostToDevice);
-  
-  material* cudamaterials = NULL;
-  cudaMalloc((void**)&cudamaterials, numberOfMaterials*sizeof(material));
-  cudaMemcpy( cudamaterials, materials, numberOfMaterials*sizeof(material), cudaMemcpyHostToDevice);
-
-  //package camera
-  cameraData cam;
-  cam.resolution = renderCam->resolution;
-  cam.position = renderCam->positions[frame];
-  cam.view = renderCam->views[frame];
-  cam.up = renderCam->ups[frame];
-  cam.fov = renderCam->fov;
-
-  //kernel launches
-  for(int bounce = 1; bounce <= 1; ++bounce)
-  {
-  raytraceRay<<<fullBlocksPerGrid, threadsPerBlock>>>(renderCam->resolution, (float)iterations, (float)bounce, cam, traceDepth, cudaimage, cudageoms, numberOfGeoms, cudamaterials, numberOfMaterials);
-  }
-  sendImageToPBO<<<fullBlocksPerGrid, threadsPerBlock>>>(PBOpos, renderCam->resolution, cudaimage);
-
-  //retrieve image from GPU
-  cudaMemcpy( renderCam->image, cudaimage, (int)renderCam->resolution.x*(int)renderCam->resolution.y*sizeof(glm::vec3), cudaMemcpyDeviceToHost);
-
-  //free up stuff, or else we'll leak memory like a madman
-  cudaFree( cudaimage );
-  cudaFree( cudageoms );
-  cudaFree( cudamaterials );
-  delete [] geomList;
-
-  // make certain the kernel has completed 
-  cudaThreadSynchronize();
-
-  checkCUDAError("Kernel failed!");
+void cudaRaytraceCore(uchar4* PBOpos, camera* renderCam,  renderOptions* rconfig, int frame, int iterations, int frameFilterCounter, material* materials, int numberOfMaterials, geom* geoms, int numberOfGeoms){
+	//Renders one iteration of 'frame': uploads image and scene, runs the kernels for rconfig->mode, copies the image back to renderCam->image and draws it to PBOpos
+	int traceDepth = rconfig->traceDepth; //determines how many bounces the raytracer traces
+	int numPixels = renderCam->resolution.x*renderCam->resolution.y;
+	int rayPoolSize = (int) ceil(float(numPixels)*rconfig->rayPoolSize);
+
+	// Per-pixel launch configuration: tileSize x tileSize threads per block, enough blocks to cover the image
+	int tileSize = 8;
+	dim3 threadsPerBlockByPixel(tileSize, tileSize);
+	dim3 fullBlocksPerGridByPixel((int)ceil(float(renderCam->resolution.x)/float(tileSize)), (int)ceil(float(renderCam->resolution.y)/float(tileSize)));
+
+	// Set up a 2D grid
+	// Fill up rows before adding more
+	//TODO: Improve resource allocation. Slipping over once will create a lot of wasted blocks
+	int blockSize = 64;
+	dim3 threadsPerBlockByRay(blockSize);
+	int blockCount = (int)ceil(float(rayPoolSize)/float(blockSize));
+
+	dim3 fullBlocksPerGridByRay;
+	int maxGridX = 65535;//TODO: get this dynamically
+	if(blockCount > maxGridX){
+		fullBlocksPerGridByRay = dim3(maxGridX, (int)ceil( blockCount / float(maxGridX)));
+	}else{
+		fullBlocksPerGridByRay = dim3(blockCount);
+	}
+
+	//send image to GPU
+	glm::vec3* cudaimage = NULL;
+	cudaMalloc((void**)&cudaimage, numPixels*sizeof(glm::vec3));
+	cudaMemcpy( cudaimage, renderCam->image, numPixels*sizeof(glm::vec3), cudaMemcpyHostToDevice);
+
+	//package geometry and materials and sent to GPU
+	staticGeom* geomList = new staticGeom[numberOfGeoms];
+	for(int i=0; i<numberOfGeoms; i++){
+		staticGeom newStaticGeom;
+		newStaticGeom.type = geoms[i].type;
+		newStaticGeom.materialid = geoms[i].materialid;
+		newStaticGeom.translation = geoms[i].translations[frame];
+		newStaticGeom.rotation = geoms[i].rotations[frame];
+		newStaticGeom.scale = geoms[i].scales[frame];
+		newStaticGeom.transform = geoms[i].transforms[frame];
+		newStaticGeom.inverseTransform = geoms[i].inverseTransforms[frame];
+		geomList[i] = newStaticGeom;
+	}
+
+	///Allocations
+	staticGeom* cudageoms = NULL;
+	cudaMalloc((void**)&cudageoms, numberOfGeoms*sizeof(staticGeom));
+	cudaMemcpy( cudageoms, geomList, numberOfGeoms*sizeof(staticGeom), cudaMemcpyHostToDevice);
+
+	material* cudamaterials = NULL;
+	cudaMalloc((void**)&cudamaterials, numberOfMaterials*sizeof(material));
+	cudaMemcpy( cudamaterials, materials, numberOfMaterials*sizeof(material), cudaMemcpyHostToDevice);
+
+	rayState* cudaraypool = NULL;
+	cudaMalloc((void**)&cudaraypool, rayPoolSize*sizeof(rayState));
+
+
+	//package camera
+	cameraData cam;
+	cam.resolution = renderCam->resolution;
+	cam.position = renderCam->positions[frame];
+	cam.view = renderCam->views[frame];
+	cam.up = renderCam->ups[frame];
+	cam.fov = renderCam->fov;
+
+	///Prep image: start from a cleared image unless frame filtering is accumulating onto earlier iterations
+	if(!rconfig->frameFiltering || frameFilterCounter <= 1)
+	{
+		clearImage<<<fullBlocksPerGridByPixel, threadsPerBlockByPixel>>>(renderCam->resolution, cudaimage);
+		frameFilterCounter = 1;
+
+	}
+	//else{
+	//	scaleImageIntensity<<<fullBlocksPerGridByPixel, threadsPerBlockByPixel>>>(renderCam->resolution, cudaimage, (float)(frameFilterCounter-1));
+	//}
+
+
+	//Figure out which rays should go to which pixels.
+	thrust::default_random_engine rng(hash(iterations*frameFilterCounter+iterations));
+	thrust::uniform_real_distribution<float> u01(0,1);
+	allocateRayPool<<<fullBlocksPerGridByRay, threadsPerBlockByRay>>>(u01(rng), *rconfig, cam, cudaimage, cudaraypool, rayPoolSize);
+
+	switch(rconfig->mode)
+	{
+	case TRACEDEPTH_DEBUG:
+	case PATHTRACE:
+		raycastFromCameraKernel<<<fullBlocksPerGridByRay, threadsPerBlockByRay>>>(iterations, frame, cam, *rconfig, cudaraypool, rayPoolSize);
+
+		for(int bounce = 0; bounce < traceDepth && rayPoolSize > 0; bounce++)
+		{
+			traceRay<<<fullBlocksPerGridByRay, threadsPerBlockByRay>>>(cam, *rconfig, iterations, bounce, cudaimage, 
+				cudaraypool, rayPoolSize, cudageoms, numberOfGeoms, cudamaterials, numberOfMaterials);
+			if(rconfig->streamCompaction)
+			{
+				//printf("Raypool size%d\n", rayPoolSize);
+				rayPoolSize = raypoolCompaction(&cudaraypool, rayPoolSize);
+
+				//Shrink the launch grid to the surviving rays for the next bounce
+				blockCount = (int)ceil(float(rayPoolSize)/float(blockSize));
+
+				//Assign the function-level grid: redeclaring a dim3 here would shadow it and keep every launch at full size
+				if(blockCount > maxGridX){
+					fullBlocksPerGridByRay = dim3(maxGridX, (int)ceil( blockCount / float(maxGridX)));
+				}else{
+					fullBlocksPerGridByRay = dim3(blockCount);
+				}
+			}
+		}
+
+		break;
+	case RAYCOUNT_DEBUG:
+		displayRayCounts<<<fullBlocksPerGridByRay, threadsPerBlockByRay>>>(cam, *rconfig, cudaimage, cudaraypool, rayPoolSize,ceil(float(rayPoolSize)/numPixels));
+		break;
+
+	case NORMAL_DEBUG:
+	case FIRST_HIT_DEBUG:
+		raycastFromCameraKernel<<<fullBlocksPerGridByRay, threadsPerBlockByRay>>>(iterations, frame, cam, *rconfig, cudaraypool, rayPoolSize);
+
+		traceRayFirstHit<<<fullBlocksPerGridByRay, threadsPerBlockByRay>>>(cam, *rconfig, iterations, 0, cudaimage, 
+			cudaraypool, rayPoolSize, cudageoms, numberOfGeoms, cudamaterials, numberOfMaterials);
+
+		break;
+	}
+
+
+	//if(rconfig->frameFiltering)
+	//{
+	//	scaleImageIntensity<<<fullBlocksPerGridByPixel, threadsPerBlockByPixel>>>(renderCam->resolution, cudaimage, 1.0f/(frameFilterCounter));
+	//}
+
+
+	//retrieve image from GPU before drawing overlays and writing to screen
+	cudaMemcpy( renderCam->image, cudaimage, numPixels*sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+
+	//TODO: Draw any debug overlays here
+
+
+
+	//Draw to screen; the last argument passes 1/frameFilterCounter as the display scale
+	sendImageToPBO<<<fullBlocksPerGridByPixel, threadsPerBlockByPixel>>>(PBOpos, renderCam->resolution, cudaimage, 1.0f/float(frameFilterCounter));
+
+
+	//free up stuff, or else we'll leak memory like a madman
+	cudaFree( cudaimage );
+	cudaFree( cudageoms );
+	cudaFree( cudamaterials );
+	cudaFree( cudaraypool );
+	delete [] geomList;
+
+	// make certain the kernel has completed 
+	cudaThreadSynchronize();
+	checkCUDAError("Kernel failed!");
 }
diff --git a/src/raytraceKernel.h b/src/raytraceKernel.h
index 5fcf5a3..61ce659 100755
--- a/src/raytraceKernel.h
+++ b/src/raytraceKernel.h
@@ -13,6 +13,8 @@
 #include <cuda.h>
 #include <cmath>
 #include "sceneStructs.h"
+#include "cudaAlgorithms.h"
+
 
 #if CUDA_VERSION >= 5000
     #include <helper_math.h>
@@ -20,6 +22,11 @@
     #include <cutil_math.h>
 #endif
 
-void cudaRaytraceCore(uchar4* pos, camera* renderCam, int frame, int iterations, material* materials, int numberOfMaterials, geom* geoms, int numberOfGeoms);
+#ifdef CUDA_PROFILING
+	#include "cuda_profiler_api.h"
+#endif // CUDA_PROFILING
+
+
+void cudaRaytraceCore(uchar4* PBOpos, camera* renderCam,  renderOptions* rconfig, int frame, int iterations, int frameFilterCounter, material* materials, int numberOfMaterials, geom* geoms, int numberOfGeoms);
 
 #endif
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index b10f1cf..6a106f7 100755
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -12,12 +12,23 @@
 #include <string>
 
 enum GEOMTYPE{ SPHERE, CUBE, MESH };
+enum RENDERMODE{PATHTRACE, RAYCOUNT_DEBUG, TRACEDEPTH_DEBUG, FIRST_HIT_DEBUG, NORMAL_DEBUG};
+enum BounceType{PRIMARY, DIFFUSE, REFLECT, TRANSMIT, SHADOW};
 
 struct ray {
 	glm::vec3 origin;
 	glm::vec3 direction;
 };
 
+
+struct rayState{//State of one ray in the ray pool, read and written back by the tracing kernel each bounce
+	ray r;//Current ray
+	int index;//pixel to contribute to; the tracing kernel sets it to -1 to retire the ray
+	glm::vec3 T;//accumulated light attenuation by color; the ray is retired once no channel exceeds renderOptions::minT
+	int matIndex;//Index of transmission medium material. -1 if in free space (renderOptions::airAbsorbtion is used then).
+	BounceType bounceType;//Type of last bounce. 
+};
+
 struct geom {
 	enum GEOMTYPE type;
 	int materialid;
@@ -73,4 +84,24 @@ struct material{
 	float emittance;
 };
 
+
+struct renderOptions{//Render settings; cudaRaytraceCore passes this struct by value to the kernels
+	enum RENDERMODE mode;//selects which kernels cudaRaytraceCore launches
+	int traceDepth;//upper bound on the number of bounces traced per ray
+	float rayPoolSize;//ray pool size as a multiple of the pixel count (rounded up)
+	bool stocasticRayAssignment;//NOTE(review): presumably randomizes ray-to-pixel assignment - confirm in allocateRayPool
+
+	glm::vec3 airAbsorbtion;//absorption coefficient applied while a ray's matIndex is negative
+	float airIOR;//NOTE(review): presumably index of refraction of free space - confirm at use site
+
+	int globalLightGeomInd;//index into the geometry list of the global light; the tracer treats values > -1 as "present"
+
+	bool streamCompaction;//when set, the ray pool is compacted after every bounce
+	bool globalShadows;//when set, a diffuse ray that misses the scene casts a shadow ray at the global light
+	bool antialiasing;//NOTE(review): use not visible here - confirm in raycastFromCameraKernel
+	bool frameFiltering;//when clear, the image is cleared at the start of every cudaRaytraceCore call
+
+	float minT;//minimum ray transmission function. Lets rays die out if they won't contribute anything.
+};
+
 #endif //CUDASTRUCTS_H
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 3fd4b73..e3ca7d1 100755
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -53,7 +53,11 @@ glm::vec3 utilityCore::clampRGB(glm::vec3 color){
 }
 
 bool utilityCore::epsilonCheck(float a, float b){
-    if(fabs(fabs(a)-fabs(b))<EPSILON){
+	return epsilonCheck(a,b,EPSILON); //delegate to the three-argument overload with the default EPSILON tolerance
+}
+
+bool utilityCore::epsilonCheck(float a, float b, float ep){
+    if(fabs(fabs(a)-fabs(b))<ep){
         return true;
     }else{
         return false;
@@ -134,6 +138,33 @@ std::istream& utilityCore::safeGetline(std::istream& is, std::string& t)
         }
     }
 }
+
+
+//Prints selected fields of devProp to stdout, one per line. size_t fields are widened to unsigned long long so they print correctly on 64-bit builds.
+void utilityCore::printDevProp(cudaDeviceProp devProp)
+{
+    printf("Major revision number:         %d\n",  devProp.major);
+    printf("Minor revision number:         %d\n",  devProp.minor);
+    printf("Name:                          %s\n",  devProp.name);
+    printf("Total global memory:           %llu\n",  (unsigned long long)devProp.totalGlobalMem);
+    printf("Total shared memory per block: %llu\n",  (unsigned long long)devProp.sharedMemPerBlock);
+    printf("Total registers per block:     %d\n",  devProp.regsPerBlock);
+    printf("Warp size:                     %d\n",  devProp.warpSize);
+    printf("Maximum memory pitch:          %llu\n",  (unsigned long long)devProp.memPitch);
+    printf("Maximum threads per block:     %d\n",  devProp.maxThreadsPerBlock);
+    for (int i = 0; i < 3; ++i)
+        printf("Maximum dimension %d of block:  %d\n", i, devProp.maxThreadsDim[i]);
+    for (int i = 0; i < 3; ++i)
+        printf("Maximum dimension %d of grid:   %d\n", i, devProp.maxGridSize[i]);
+    printf("Clock rate:                    %d\n",  devProp.clockRate);
+    printf("Total constant memory:         %llu\n",  (unsigned long long)devProp.totalConstMem);
+    printf("Texture alignment:             %llu\n",  (unsigned long long)devProp.textureAlignment);
+    printf("Concurrent copy and execution: %s\n",  (devProp.deviceOverlap ? "Yes" : "No"));
+    printf("Number of multiprocessors:     %d\n",  devProp.multiProcessorCount);
+    printf("Kernel execution timeout:      %s\n",  (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
+}
+
+
 //-----------------------------
 //-------GLM Printers----------
 //-----------------------------
diff --git a/src/utilities.h b/src/utilities.h
index 84ec55f..e17120b 100755
--- a/src/utilities.h
+++ b/src/utilities.h
@@ -17,19 +17,31 @@
 #include <vector>
 #include "cudaMat4.h"
 
-#define PI                          3.1415926535897932384626422832795028841971
+
+#ifdef CUDA_PROFILING
+	#include "cuda_profiler_api.h"
+	#include "nvTools/nvToolsExt.h"
+#endif
+
+#define PI							3.1415926535897932384626422832795028841971
 #define TWO_PI                      6.2831853071795864769252867665590057683943
 #define SQRT_OF_ONE_THIRD           0.5773502691896257645091487805019574556476
-#define NATURAL_E                   2.7182818284590452353602874713526624977572
-#define EPSILON                     .000000001
-#define ZERO_ABSORPTION_EPSILON     0.00001
-#define RAY_BIAS_AMOUNT             0.0002
+#define E                           2.7182818284590452353602874713526624977572
+#define EPSILON                     0.000000001f
+#define ZERO_ABSORPTION_EPSILON     0.00001f
+#define RAY_BIAS_AMOUNT             0.005f
+
+#define MIN(a,b)	(((a)<(b))?(a):(b))	// arguments are parenthesized but still evaluated more than once
+#define MAX(a,b)	(((a)>(b))?(a):(b))	// arguments are parenthesized but still evaluated more than once
+#define SWAP(x,y,t)    {(t)=(x);(x)=(y);(y)=(t);}	// t is caller-supplied scratch storage
+
 
 namespace utilityCore {
     extern float clamp(float f, float min, float max);
     extern bool replaceString(std::string& str, const std::string& from, const std::string& to);
     extern glm::vec3 clampRGB(glm::vec3 color);
     extern bool epsilonCheck(float a, float b);
+    extern bool epsilonCheck(float a, float b, float ep);
     extern std::vector<std::string> tokenizeString(std::string str); 
     extern cudaMat4 glmMat4ToCudaMat4(glm::mat4 a);
     extern glm::mat4 cudaMat4ToGlmMat4(cudaMat4 a);
@@ -37,6 +49,7 @@ namespace utilityCore {
     extern void printCudaMat4(cudaMat4 m);
     extern std::string convertIntToString(int number);
     extern std::istream& safeGetline(std::istream& is, std::string& t); //Thanks to http://stackoverflow.com/a/6089413
+	extern void printDevProp(cudaDeviceProp devProp);//Thanks to http://gpucoder.livejournal.com/1064.html
 
     //-----------------------------
     //-------GLM Printers----------
diff --git a/testdata/HallOfMirrors.fig b/testdata/HallOfMirrors.fig
new file mode 100644
index 0000000..dd7c1b0
Binary files /dev/null and b/testdata/HallOfMirrors.fig differ
diff --git a/testdata/HallOfMirrors.png b/testdata/HallOfMirrors.png
new file mode 100644
index 0000000..dc01d4b
Binary files /dev/null and b/testdata/HallOfMirrors.png differ
diff --git a/testdata/importfile.m b/testdata/importfile.m
new file mode 100644
index 0000000..d9cc125
--- /dev/null
+++ b/testdata/importfile.m
@@ -0,0 +1,22 @@
+function importfile(fileToRead1)
+%IMPORTFILE(FILETOREAD1)
+%  Loads FILETOREAD1 with importdata and creates one base-workspace variable per data column
+%  FILETOREAD1:  file to read; the importdata result must have .colheaders and .data fields
+
+%  Auto-generated by MATLAB on 03-Oct-2013 14:59:43
+
+% Import the file
+newData1 = importdata(fileToRead1);
+
+% Break the data up into a new structure with one field per column (field names come from genvarname on the headers).
+colheaders = genvarname(newData1.colheaders);
+for i = 1:length(colheaders)
+    dataByColumn1.(colheaders{i}) = newData1.data(:, i);
+end
+
+% Create new variables in the base workspace from those fields (an existing base variable of the same name is replaced).
+vars = fieldnames(dataByColumn1);
+for i = 1:length(vars)
+    assignin('base', vars{i}, dataByColumn1.(vars{i}));
+end
+
diff --git a/testdata/island.fig b/testdata/island.fig
new file mode 100644
index 0000000..8e6e265
Binary files /dev/null and b/testdata/island.fig differ
diff --git a/testdata/island.png b/testdata/island.png
new file mode 100644
index 0000000..7565f80
Binary files /dev/null and b/testdata/island.png differ
diff --git a/testdata/performance_sweep_island_long.csv b/testdata/performance_sweep_island_long.csv
new file mode 100644
index 0000000..0361597
--- /dev/null
+++ b/testdata/performance_sweep_island_long.csv
@@ -0,0 +1,101 @@
+TraceDepth,NumBounces,RunTimeMS_SC,RunTimeMS_NoSC
+0,0,7.36,7.58
+1,1,12.14,10.64
+2,1.28081,17.9,11.96
+3,1.32611,22.34,14.08
+4,2.06472,24.5,15.7
+5,2.15868,25.86,16.78
+6,2.17549,26.56,17.22
+7,2.17946,26.86,17.64
+8,2.18033,26.76,17.94
+9,2.1809,26.76,18.28
+10,2.18104,27.18,18.72
+11,2.18089,27.16,19.04
+12,2.18087,27.14,19.28
+13,2.18094,27.5,19.84
+14,2.1809,27.98,20.08
+15,2.18098,28,20.36
+16,2.18089,28.06,20.86
+17,2.18103,28.1,21.34
+18,2.18084,28.14,21.5
+19,2.18128,28.28,21.76
+20,2.18118,28.08,21.94
+21,2.18106,28.06,22.18
+22,2.18085,27.72,22.36
+23,2.18111,27.4,22.4
+24,2.18117,27.38,22.6
+25,2.18108,28.06,22.72
+26,2.18113,28.14,22.84
+27,2.18119,28.12,23.1
+28,2.18117,27.6,23.22
+29,2.18121,27.46,23.24
+30,2.18101,27.58,23.42
+31,2.18088,27.62,23.54
+32,2.18126,27.46,23.82
+33,2.18096,27.42,23.88
+34,2.1811,27.58,23.96
+35,2.18092,27.4,24.16
+36,2.18086,27.54,24.26
+37,2.1809,27.72,24.56
+38,2.18073,27.66,24.84
+39,2.18097,27.44,25.1
+40,2.18106,27.36,25.18
+41,2.18091,27.54,25.34
+42,2.1809,28.02,25.5
+43,2.18084,28.04,25.5
+44,2.18098,27.78,25.74
+45,2.18094,27.56,25.84
+46,2.1806,27.34,25.9
+47,2.1809,27.26,26.22
+48,2.18074,27.7,26.38
+49,2.18086,28.08,26.52
+50,2.18098,28.04,26.52
+51,2.18086,27.86,26.64
+52,2.18069,27.14,26.82
+53,2.18097,27.08,26.9
+54,2.18067,27.28,27.02
+55,2.18072,27.08,27.16
+56,2.18127,27.58,27.28
+57,2.18092,27.92,27.5
+58,2.18105,28,27.62
+59,2.1807,28.1,27.78
+60,2.18121,28.1,27.9
+61,2.1812,27.92,27.96
+62,2.18108,27.82,28.26
+63,2.18108,27.4,28.36
+64,2.18087,27.26,28.4
+65,2.18099,27.3,28.74
+66,2.18104,27.3,29.02
+67,2.18305,27.32,28.86
+68,2.18714,27.24,29.06
+69,2.19181,27.18,29.16
+70,2.19718,27.34,29.3
+71,2.20455,27.52,29.08
+72,2.21301,28.3,29.04
+73,2.22175,28.64,28.84
+74,2.23145,28.68,28.76
+75,2.24148,28.92,28.62
+76,2.25168,29.06,28.56
+77,2.26233,29.18,28.52
+78,2.27141,28.62,28.46
+79,2.28005,28.46,28.32
+80,2.28835,28.54,28.2
+81,2.29525,28.58,28.12
+82,2.30314,28.56,28.06
+83,2.31066,28.86,28.08
+84,2.31793,28.8,28.1
+85,2.32615,29.12,28.1
+86,2.3333,29.98,28.32
+87,2.33837,29.5,28.2
+88,2.34223,29.64,28.36
+89,2.34576,29.06,28.54
+90,2.34993,28.86,28.6
+91,2.35379,29.02,28.66
+92,2.35808,28.94,28.84
+93,2.36281,29.1,28.94
+94,2.36782,28.96,29.06
+95,2.37296,29.22,29.1
+96,2.37777,29.46,29.36
+97,2.38291,29.98,29.58
+98,2.38839,30.16,29.58
+99,2.39361,30.12,29.74
diff --git a/testdata/performance_sweep_island_short.csv b/testdata/performance_sweep_island_short.csv
new file mode 100644
index 0000000..67deed1
--- /dev/null
+++ b/testdata/performance_sweep_island_short.csv
@@ -0,0 +1,101 @@
+TraceDepth,NumBounces,RunTimeMS_SC,RunTimeMS_NoSC
+0,0,7.28,7.64
+1,1,11.96,10.36
+2,1.28055,17.84,11.96
+3,1.32635,21.96,13.8
+4,2.06486,24.16,15.8
+5,2.15891,25.24,16.64
+6,2.17568,25.76,16.96
+7,2.1795,25.68,17.48
+8,2.18016,26.84,18.28
+9,2.18078,27.12,18.48
+10,2.18094,26.92,18.44
+11,2.18091,27,19.2
+12,2.18082,27.36,19.52
+13,2.18063,27,19.48
+14,2.18109,27.16,20.08
+15,2.18131,27.32,20.32
+16,2.18131,27.04,20.32
+17,2.18096,27.28,21.04
+18,2.18094,27.48,21.56
+19,2.18105,27.28,21.84
+20,2.18097,27.32,22.08
+21,2.18115,27.52,22.16
+22,2.18079,27.52,22.32
+23,2.18099,27.36,22.56
+24,2.18113,27.48,22.68
+25,2.18062,27.24,22.76
+26,2.18092,27.52,23
+27,2.18096,27.52,23.12
+28,2.18098,27.36,23.28
+29,2.18082,27.4,23.4
+30,2.18084,27.36,23.44
+31,2.18112,27.64,23.52
+32,2.18104,27.56,23.6
+33,2.18075,27.44,23.72
+34,2.18105,27.84,23.92
+35,2.18101,27.08,23.92
+36,2.1809,27.76,24.2
+37,2.18078,27.8,24.32
+38,2.18148,27.64,24.48
+39,2.18108,27.8,24.6
+40,2.18128,27.32,24.84
+41,2.18108,27.84,25.08
+42,2.18098,28.08,25.44
+43,2.18114,28.2,25.44
+44,2.18067,27.92,25.6
+45,2.18103,27.96,25.88
+46,2.18098,27.6,25.84
+47,2.18125,27.28,26.16
+48,2.18111,27.4,26.36
+49,2.18123,27.28,26.52
+50,2.18101,28.36,26.6
+51,2.18116,27.44,26.84
+52,2.18101,27.2,26.68
+53,2.18125,27.32,26.92
+54,2.18117,27.72,27.08
+55,2.18122,27.72,-1.63018e+009
+56,6475.94,2139.69,1.89306e+008
+57,-1.75987e+022,-3.98669e+037,27.52
+58,-0.315393,-4.06447e+023,-3155.74
+59,3.18853e+032,3.18853e+032,3.18853e+032
+60,2.18082,27.36,27.72
+61,2.18121,27.32,28
+62,2.18096,27.16,28.16
+63,2.1808,27.44,28.36
+64,2.18138,27.2,28.36
+65,2.18113,27.36,28.48
+66,2.18094,27.24,28.76
+67,2.18097,27.16,28.8
+68,2.18094,27.4,29
+69,2.18126,27.16,29.08
+70,2.18076,26.96,29.08
+71,2.18108,27.32,29.36
+72,2.18102,27.16,29.52
+73,2.18069,26.96,29.68
+74,2.18086,27.12,29.68
+75,2.18094,27.12,30
+76,2.18076,27.52,30.08
+77,2.1807,27.48,30.64
+78,2.18108,26.96,30.4
+79,2.18086,26.76,30.44
+80,2.18104,28,31.4
+81,2.18109,27.72,31.28
+82,2.18082,28.04,31.76
+83,2.181,27.92,31.64
+84,2.18079,28.08,31.8
+85,2.181,28.52,31.92
+86,2.1809,28.12,32.12
+87,2.18079,28.12,31.8
+88,2.18119,27.96,32.04
+89,2.18077,28.2,32.2
+90,2.18095,27.76,32.36
+91,2.18093,28.04,32.48
+92,2.18048,28.04,32.48
+93,2.18073,27.84,32.76
+94,2.18103,27.84,32.92
+95,2.18077,27.4,33.08
+96,2.18079,27.28,33.12
+97,2.1807,27.36,33.32
+98,2.18071,27.28,33.32
+99,2.181,28.04,33.52
diff --git a/testdata/performance_sweep_mirrors_long.csv b/testdata/performance_sweep_mirrors_long.csv
new file mode 100644
index 0000000..0ec7b57
--- /dev/null
+++ b/testdata/performance_sweep_mirrors_long.csv
@@ -0,0 +1,101 @@
+TraceDepth,NumBounces,RunTimeMS_SC,RunTimeMS_NoSC
+0,0,8.06,8.14
+1,1,13.34,11.7
+2,1.99996,23.1,15.58
+3,2.98821,34.36,20.7
+4,3.85157,44.9,26.48
+5,4.60391,54.32,32.74
+6,5.23753,62.96,38.68
+7,5.77403,70.14,44.54
+8,6.22316,76.42,50.12
+9,6.59761,82.18,55.94
+10,6.91098,86.72,61.66
+11,7.1704,90.84,67.32
+12,7.38608,94.68,72.8
+13,7.56067,97.52,77.76
+14,7.70196,99.28,82.8
+15,7.81185,101.62,87.72
+16,7.89672,102.62,92.94
+17,7.95923,104.98,97.84
+18,8.00739,105.14,102.44
+19,8.04305,107.96,107.1
+20,8.06818,108.64,111.42
+21,8.08859,109.56,115.78
+22,8.10551,110.48,120.06
+23,8.11761,110.7,124.32
+24,8.1259,110.9,128.44
+25,8.13407,110.3,132.54
+26,8.14076,110.7,136.28
+27,8.14535,111.32,140.38
+28,8.15271,112.18,144.02
+29,8.15587,112.96,147.64
+30,8.15932,112.38,151.04
+31,8.16367,113.68,154.42
+32,8.1664,115.34,157.38
+33,8.17001,115.7,160.2
+34,8.17266,115.78,162.94
+35,8.17457,116.42,165.42
+36,8.17759,115.44,167.9
+37,8.17821,116.1,170.12
+38,8.18143,116.62,172.36
+39,8.1839,117.06,174.6
+40,8.18461,116.58,176.22
+41,8.18612,117.76,178.28
+42,8.18622,117.42,179.6
+43,8.18769,117.68,181.24
+44,8.19151,118.02,182.64
+45,8.19184,118.24,184
+46,8.19222,118.08,185.3
+47,8.19453,118.14,186.5
+48,8.19536,118.46,187.68
+49,8.195,119.42,188.6
+50,8.19726,119.08,189.8
+51,8.19798,119.92,190.88
+52,8.19866,120.32,191.68
+53,8.19621,120.36,192.44
+54,8.19884,120.6,193.42
+55,8.19969,120.76,194.22
+56,8.20084,119.52,194.76
+57,8.20104,119.88,195.46
+58,8.20175,120.88,196.34
+59,8.20291,121.98,196.88
+60,8.20411,121,197.48
+61,8.20436,120.94,198.24
+62,8.20367,120.92,198.68
+63,8.20591,121.9,199.2
+64,8.20681,122.34,199.9
+65,8.20705,125.1,200.3
+66,8.20735,122.96,200.66
+67,8.20465,122.42,200.34
+68,8.19841,122.76,198.96
+69,8.20858,122.72,197.96
+70,8.21462,122.56,196.98
+71,8.22134,123.06,196.18
+72,8.22451,122.88,195.32
+73,8.22412,122.64,194.32
+74,8.22128,124.74,193.4
+75,8.21947,124.98,192.4
+76,8.219,127.58,191.48
+77,8.21435,127.22,190.86
+78,8.219,127.98,190.02
+79,8.22195,128.08,189.54
+80,8.22713,128.1,188.6
+81,8.23446,128.2,187.92
+82,8.24302,128.96,187.38
+83,8.25452,127.2,187.06
+84,8.26632,125.28,186.36
+85,8.27877,126.92,186.28
+86,8.28638,128.24,185.5
+87,8.29384,126.7,185.06
+88,8.30232,125.76,184.6
+89,8.30866,126.66,184.42
+90,8.3186,127.38,183.62
+91,8.32787,131.24,183.06
+92,8.33679,128.54,182.66
+93,8.34497,129.04,182.32
+94,8.35505,130.36,181.74
+95,8.36379,130.36,181.42
+96,8.36947,131.02,180.68
+97,8.37711,129.7,180.46
+98,8.38411,128.32,180.04
+99,8.39038,129.02,179.64
diff --git a/testdata/performance_sweep_mirrors_short.csv b/testdata/performance_sweep_mirrors_short.csv
new file mode 100644
index 0000000..654ce3d
--- /dev/null
+++ b/testdata/performance_sweep_mirrors_short.csv
@@ -0,0 +1,101 @@
+TraceDepth,NumBounces,RunTimeMS_SC,RunTimeMS_NoSC
+0,0,216.6,15.7302
+1,1,224.2,20.136
+2,1.99996,235.6,25.1443
+3,2.98822,246.2,29.1411
+4,3.85164,45.2,34.9508
+5,4.60405,54.8,40.954
+6,5.23865,63.4,46.7587
+7,5.77352,70.8,52.9654
+8,6.22251,76.8,58.7691
+9,6.59756,82.4,64.1715
+10,6.91047,87.2003,69.9721
+11,7.17222,91.4,75.7727
+12,7.38887,94.8,81.178
+13,7.5643,98.2,85.7716
+14,7.6992,-1.#QNAN,91.1825
+15,7.81429,102.4,96.1785
+16,7.89908,104.2,100.987
+17,7.9623,105.8,105.987
+18,8.0074,106.8,110.789
+19,8.042,107.6,115.186
+20,8.07451,108.8,119.79
+21,8.09091,109.4,123.991
+22,8.10624,110.8,128.789
+23,8.12001,111.2,132.791
+24,15.7302,3.15656e+033,2.11877e+011
+25,20.136,3.15656e+033,153.429
+26,25.1443,4.40667e+032,2.39773e+011
+27,29.1411,4.40667e+032,840873
+28,34.9508,4.37878e+032,2.23637e+011
+29,40.954,3.04237e+032,5.27631e+007
+30,46.7587,115.4,3554.83
+31,52.9654,115.6,3.46084e+009
+32,58.7691,116.2,2.09687e+011
+33,64.1715,116.8,168.205
+34,69.9721,116.4,170.996
+35,75.7727,116.8,173.999
+36,81.178,1.01977e+012,176.202
+37,85.7716,117.056,178.603
+38,91.1825,2.42143e+014,181.198
+39,96.1785,3.29346e+009,182.804
+40,100.987,1.7456e+031,184.602
+41,105.987,7.21492e+022,186.208
+42,110.789,2.00636e+017,188.201
+43,115.186,118.6,189.606
+44,119.79,118.2,190.804
+45,123.991,117.8,192.601
+46,128.789,119.2,193.412
+47,132.791,118.6,195.008
+48,2.11877e+011,1.13075e+024,196.207
+49,153.429,15311.2,197.008
+50,2.39773e+011,1.81768e+031,197.81
+51,840873,2.09713e+017,199.203
+52,2.23637e+011,1.13075e+024,199.609
+53,5.27631e+007,120.2,200.612
+54,3554.83,6.97763e+022,201.606
+55,3.46084e+009,1.36401e+034,202.21
+56,2.09687e+011,120.2,203.215
+57,168.205,121,203.809
+58,170.996,122,204.815
+59,173.999,122.6,205.413
+60,176.202,120.6,205.011
+61,178.603,121,206.809
+62,181.198,121.2,207.215
+63,182.804,121.2,207.61
+64,184.602,122.8,208.011
+65,186.208,122.4,208.412
+66,188.201,120.8,208.814
+67,189.606,123.4,209.218
+68,190.804,123.6,210.012
+69,192.601,123.6,209.816
+70,193.412,122.8,210.411
+71,195.008,122.6,211.212
+72,196.207,1.01977e+012,2.11877e+011
+73,197.008,124.256,13480.6
+74,197.81,2.42143e+014,212.468
+75,199.203,3.29346e+009,1.51339e+013
+76,199.609,1.7456e+031,1.92046e+031
+77,200.612,7.21492e+022,1.15701e+027
+78,201.606,2.00636e+017,205.8
+79,202.21,125.4,7.58799e+031
+80,203.215,126.8,206.2
+81,203.809,125.6,207.2
+82,204.815,126.4,206.8
+83,205.413,126.2,206.8
+84,205.011,126.8,2.11877e+011
+85,206.809,127.4,13476.4
+86,207.215,129,208.248
+87,207.61,129.4,1.51339e+013
+88,208.011,130.2,1.92046e+031
+89,208.412,130.6,1.15701e+027
+90,208.814,131.8,208.6
+91,209.218,130.4,7.58799e+031
+92,210.012,132,209.4
+93,209.816,131,209.6
+94,210.411,131.4,209.4
+95,211.212,131.4,210.2
+96,2.11877e+011,130.4,216.6
+97,13480.6,132,224.2
+98,212.468,132.8,235.6
+99,1.51339e+013,133.4,246.2
diff --git a/testdata/plotdata.m b/testdata/plotdata.m
new file mode 100644
index 0000000..63f54f9
--- /dev/null
+++ b/testdata/plotdata.m
@@ -0,0 +1,17 @@
+clear all
+close all
+importfile('performance_sweep_island_long.csv');
+% importfile creates one base-workspace variable per CSV column: TraceDepth, NumBounces, RunTimeMS_SC, RunTimeMS_NoSC
+figure
+[AX,H1,H2] = plotyy(TraceDepth, RunTimeMS_SC, TraceDepth, NumBounces);
+hold all
+plot(TraceDepth, RunTimeMS_NoSC, 'r')
+% NOTE(review): the red no-compaction curve is drawn on the current axes - presumably AX(1), the runtime axis; confirm
+ylabel('')
+xlabel('Max Trace Depth');
+set(get(AX(1),'Ylabel'),'String','Time Per Frame (ms)') 
+set(get(AX(2),'Ylabel'),'String','Average Bounces Per Ray') 
+% NOTE(review): legend labels are matched to curves by position, not by handle - verify the order in the rendered figure
+legend('Stream Compaction Runtime','No Stream Compaction Runtime', 'Avg Num Bounces');
+
+title('Open Environment (Sundial)')
\ No newline at end of file