Fix T44833: Can't use ccl_local space in non-kernel functions

This commit re-shuffles code in the split kernel once again, so that the
common parts which live in the headers are only responsible for doing the
work needed for the specified ray index. Getting the ray index, checking its
validity and enqueuing tasks now happen in the device-specific part of the
kernel.

This actually makes sense, because enqueuing is indeed device-specific;
with CUDA, for example, we'll want to enqueue kernels from the kernel itself
and avoid a CPU roundtrip.

TODO:
- Kernel comments are still placed in the common header files, but since
  queue-related state is no longer passed to those functions, those comments
  might need to be split as well.

  For now, read them keeping in mind that they also cover the way all
  devices invoke the common code path.

- Arguments might need to be wrapped into KernelGlobals, so we don't need to
  pass them all around as function arguments.
Author: Sergey Sharybin
Date:   2015-05-26 19:12:49 +05:00
parent 4ffcc6ff56
commit 84ad20acef
22 changed files with 1140 additions and 1105 deletions
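To make the split concrete, here is a minimal sketch of the pattern each wrapper kernel below now follows. The names do_per_ray_work() and kernel_ocl_do_per_ray_work() are hypothetical stand-ins for the common ccl_device functions and their OpenCL wrappers; get_ray_index(), enqueue_ray_index_local(), IS_STATE() and the queue constants are the real helpers visible in the diffs. This is an illustration of the pattern, not code from the commit:

/* Common header side: pure per-ray work; no queues, no ccl_local. */
ccl_device char do_per_ray_work(ccl_global char *ray_state, int ray_index)
{
    char enqueue_flag = 0;
    if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
        /* ... actual per-ray computation goes here ... */
        enqueue_flag = 1;
    }
    return enqueue_flag;
}

/* Device-specific side (OpenCL shown): fetch ray index, check validity,
 * call the common code, then enqueue using local memory atomics. */
__kernel void kernel_ocl_do_per_ray_work(ccl_global char *ray_state,
                                         ccl_global int *Queue_data,
                                         ccl_global int *Queue_index,
                                         int queuesize)
{
    ccl_local unsigned int local_queue_atomics;
    if(get_local_id(0) == 0 && get_local_id(1) == 0) {
        local_queue_atomics = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
    ray_index = get_ray_index(ray_index,
                              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                              Queue_data,
                              queuesize,
                              0);

    char enqueue_flag = 0;
    if(ray_index != QUEUE_EMPTY_SLOT) {
        enqueue_flag = do_per_ray_work(ray_state, ray_index);
    }
    /* Work-group-wide enqueue; all threads must reach this point. */
    enqueue_ray_index_local(ray_index,
                            QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                            enqueue_flag,
                            queuesize,
                            &local_queue_atomics,
                            Queue_data,
                            Queue_index);
}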


@@ -169,7 +169,6 @@ set(SRC_SPLIT_HEADERS
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
-	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
 	split/kernel_shader_eval.h
 	split/kernel_shadow_blocked.h


@@ -48,34 +48,81 @@ __kernel void kernel_ocl_path_trace_background_buffer_update(
 #endif
         int parallel_samples)  /* Number of samples to be processed in parallel */
 {
-	kernel_background_buffer_update(globals,
-	                                data,
-	                                shader_data,
-	                                per_sample_output_buffers,
-	                                rng_state,
-	                                rng_coop,
-	                                throughput_coop,
-	                                PathRadiance_coop,
-	                                Ray_coop,
-	                                PathState_coop,
-	                                L_transparent_coop,
-	                                ray_state,
-	                                sw, sh, sx, sy, stride,
-	                                rng_state_offset_x,
-	                                rng_state_offset_y,
-	                                rng_state_stride,
-	                                work_array,
-	                                Queue_data,
-	                                Queue_index,
-	                                queuesize,
-	                                end_sample,
-	                                start_sample,
-#ifdef __WORK_STEALING__
-	                                work_pool_wgs,
-	                                num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-	                                debugdata_coop,
-#endif
-	                                parallel_samples);
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active, since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag =
+			kernel_background_buffer_update(globals,
+			                                data,
+			                                shader_data,
+			                                per_sample_output_buffers,
+			                                rng_state,
+			                                rng_coop,
+			                                throughput_coop,
+			                                PathRadiance_coop,
+			                                Ray_coop,
+			                                PathState_coop,
+			                                L_transparent_coop,
+			                                ray_state,
+			                                sw, sh, sx, sy, stride,
+			                                rng_state_offset_x,
+			                                rng_state_offset_y,
+			                                rng_state_stride,
+			                                work_array,
+			                                end_sample,
+			                                start_sample,
+#ifdef __WORK_STEALING__
+			                                work_pool_wgs,
+			                                num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+			                                debugdata_coop,
+#endif
+			                                parallel_samples,
+			                                ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * these rays will be made active during the next SceneIntersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
 }
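A note on the __COMPUTE_DEVICE_GPU__ dance above: OpenCL requires that a barrier() be reached by every work-item of a work-group, and the trailing enqueue_ray_index_local() is work-group-wide. So on CPU-like devices, threads whose queue slot is empty cannot simply return early; their work is guarded by the if() instead, and only GPU builds take the early-return path.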


@@ -17,130 +17,129 @@
 #include "split/kernel_data_init.h"
 __kernel void kernel_ocl_path_trace_data_init(
         ccl_global char *globals,
         ccl_global char *shader_data_sd,            /* Arguments related to ShaderData */
         ccl_global char *shader_data_sd_DL_shadow,  /* Arguments related to ShaderData */
         ccl_global float3 *P_sd,
         ccl_global float3 *P_sd_DL_shadow,
         ccl_global float3 *N_sd,
         ccl_global float3 *N_sd_DL_shadow,
         ccl_global float3 *Ng_sd,
         ccl_global float3 *Ng_sd_DL_shadow,
         ccl_global float3 *I_sd,
         ccl_global float3 *I_sd_DL_shadow,
         ccl_global int *shader_sd,
         ccl_global int *shader_sd_DL_shadow,
         ccl_global int *flag_sd,
         ccl_global int *flag_sd_DL_shadow,
         ccl_global int *prim_sd,
         ccl_global int *prim_sd_DL_shadow,
         ccl_global int *type_sd,
         ccl_global int *type_sd_DL_shadow,
         ccl_global float *u_sd,
         ccl_global float *u_sd_DL_shadow,
         ccl_global float *v_sd,
         ccl_global float *v_sd_DL_shadow,
         ccl_global int *object_sd,
         ccl_global int *object_sd_DL_shadow,
         ccl_global float *time_sd,
         ccl_global float *time_sd_DL_shadow,
         ccl_global float *ray_length_sd,
         ccl_global float *ray_length_sd_DL_shadow,
         ccl_global int *ray_depth_sd,
         ccl_global int *ray_depth_sd_DL_shadow,
         ccl_global int *transparent_depth_sd,
         ccl_global int *transparent_depth_sd_DL_shadow,
         /* Ray differentials. */
         ccl_global differential3 *dP_sd,
         ccl_global differential3 *dP_sd_DL_shadow,
         ccl_global differential3 *dI_sd,
         ccl_global differential3 *dI_sd_DL_shadow,
         ccl_global differential *du_sd,
         ccl_global differential *du_sd_DL_shadow,
         ccl_global differential *dv_sd,
         ccl_global differential *dv_sd_DL_shadow,
         /* Dp/Du */
         ccl_global float3 *dPdu_sd,
         ccl_global float3 *dPdu_sd_DL_shadow,
         ccl_global float3 *dPdv_sd,
         ccl_global float3 *dPdv_sd_DL_shadow,
         /* Object motion. */
         ccl_global Transform *ob_tfm_sd,
         ccl_global Transform *ob_tfm_sd_DL_shadow,
         ccl_global Transform *ob_itfm_sd,
         ccl_global Transform *ob_itfm_sd_DL_shadow,
         ShaderClosure *closure_sd,
         ShaderClosure *closure_sd_DL_shadow,
         ccl_global int *num_closure_sd,
         ccl_global int *num_closure_sd_DL_shadow,
         ccl_global float *randb_closure_sd,
         ccl_global float *randb_closure_sd_DL_shadow,
         ccl_global float3 *ray_P_sd,
         ccl_global float3 *ray_P_sd_DL_shadow,
         ccl_global differential3 *ray_dP_sd,
         ccl_global differential3 *ray_dP_sd_DL_shadow,
         ccl_constant KernelData *data,
         ccl_global float *per_sample_output_buffers,
         ccl_global uint *rng_state,
         ccl_global uint *rng_coop,              /* rng array to store rng values for all rays */
         ccl_global float3 *throughput_coop,     /* throughput array to store throughput values for all rays */
         ccl_global float *L_transparent_coop,   /* L_transparent array to store L_transparent values for all rays */
         PathRadiance *PathRadiance_coop,        /* PathRadiance array to store PathRadiance values for all rays */
         ccl_global Ray *Ray_coop,               /* Ray array to store Ray information for all rays */
         ccl_global PathState *PathState_coop,   /* PathState array to store PathState information for all rays */
         ccl_global char *ray_state,             /* Stores information on current state of a ray */
 #define KERNEL_TEX(type, ttype, name) \
         ccl_global type *name,
 #include "../../kernel_textures.h"
         int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
         int rng_state_offset_x,
         int rng_state_offset_y,
         int rng_state_stride,
         ccl_global int *Queue_data,             /* Memory for queues */
         ccl_global int *Queue_index,            /* Tracks the number of elements in queues */
         int queuesize,                          /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,       /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
         ccl_global unsigned int *work_array,    /* work array to store which work each ray belongs to */
 #ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
         unsigned int num_samples,               /* Total number of samples per pixel */
 #endif
 #ifdef __KERNEL_DEBUG__
         DebugData *debugdata_coop,
 #endif
-        int parallel_samples    /* Number of samples to be processed in parallel */
-        )
+        int parallel_samples)   /* Number of samples to be processed in parallel */
 {
 	kernel_data_init(globals,
 	                 shader_data_sd,


@@ -31,17 +31,60 @@ __kernel void kernel_ocl_path_trace_direct_lighting(
         ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
         int queuesize)                /* Size (capacity) of each queue */
 {
-	kernel_direct_lighting(globals,
-	                       data,
-	                       shader_data,
-	                       shader_DL,
-	                       rng_coop,
-	                       PathState_coop,
-	                       ISLamp_coop,
-	                       LightRay_coop,
-	                       BSDFEval_coop,
-	                       ray_state,
-	                       Queue_data,
-	                       Queue_index,
-	                       queuesize);
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active, since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_direct_lighting(globals,
+		                                      data,
+		                                      shader_data,
+		                                      shader_DL,
+		                                      rng_coop,
+		                                      PathState_coop,
+		                                      ISLamp_coop,
+		                                      LightRay_coop,
+		                                      BSDFEval_coop,
+		                                      ray_state,
+		                                      ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+#endif
 }


@@ -41,27 +41,84 @@ __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao
 #endif
         int parallel_samples)  /* Number of samples to be processed in parallel */
 {
-	kernel_holdout_emission_blurring_pathtermination_ao(globals,
-	                                                    data,
-	                                                    shader_data,
-	                                                    per_sample_output_buffers,
-	                                                    rng_coop,
-	                                                    throughput_coop,
-	                                                    L_transparent_coop,
-	                                                    PathRadiance_coop,
-	                                                    PathState_coop,
-	                                                    Intersection_coop,
-	                                                    AOAlpha_coop,
-	                                                    AOBSDF_coop,
-	                                                    AOLightRay_coop,
-	                                                    sw, sh, sx, sy, stride,
-	                                                    ray_state,
-	                                                    work_array,
-	                                                    Queue_data,
-	                                                    Queue_index,
-	                                                    queuesize,
-#ifdef __WORK_STEALING__
-	                                                    start_sample,
-#endif
-	                                                    parallel_samples);
+	ccl_local unsigned int local_queue_atomics_bg;
+	ccl_local unsigned int local_queue_atomics_ao;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active, since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		kernel_holdout_emission_blurring_pathtermination_ao(
+			globals,
+			data,
+			shader_data,
+			per_sample_output_buffers,
+			rng_coop,
+			throughput_coop,
+			L_transparent_coop,
+			PathRadiance_coop,
+			PathState_coop,
+			Intersection_coop,
+			AOAlpha_coop,
+			AOBSDF_coop,
+			AOLightRay_coop,
+			sw, sh, sx, sy, stride,
+			ray_state,
+			work_array,
+#ifdef __WORK_STEALING__
+			start_sample,
+#endif
+			parallel_samples,
+			ray_index,
+			&enqueue_flag,
+			&enqueue_flag_AO_SHADOW_RAY_CAST);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics_bg,
+	                        Queue_data,
+	                        Queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        queuesize,
+	                        &local_queue_atomics_ao,
+	                        Queue_data,
+	                        Queue_index);
+#endif
 }


@@ -17,23 +17,57 @@
 #include "split/kernel_lamp_emission.h"
 __kernel void kernel_ocl_path_trace_lamp_emission(
         ccl_global char *globals,
         ccl_constant KernelData *data,
         ccl_global char *shader_data,          /* Required for lamp emission */
         ccl_global float3 *throughput_coop,    /* Required for lamp emission */
         PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
         ccl_global Ray *Ray_coop,              /* Required for lamp emission */
         ccl_global PathState *PathState_coop,  /* Required for lamp emission */
         Intersection *Intersection_coop,       /* Required for lamp emission */
         ccl_global char *ray_state,            /* Denotes the state of each ray */
         int sw, int sh,
         ccl_global int *Queue_data,            /* Memory for queues */
         ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
         int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use queues to fetch ray index */
-        int parallel_samples   /* Number of samples to be processed in parallel */
-        )
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples)  /* Number of samples to be processed in parallel */
 {
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* We will empty this queue in this kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Fetch use_queues_flag. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh) {
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
 	kernel_lamp_emission(globals,
 	                     data,
 	                     shader_data,
@@ -44,9 +78,7 @@ __kernel void kernel_ocl_path_trace_lamp_emission(
 	                     Intersection_coop,
 	                     ray_state,
 	                     sw, sh,
-	                     Queue_data,
-	                     Queue_index,
-	                     queuesize,
 	                     use_queues_flag,
-	                     parallel_samples);
+	                     parallel_samples,
+	                     ray_index);
 }


@@ -35,25 +35,81 @@ __kernel void kernel_ocl_path_trace_next_iteration_setup(
         ccl_global int *Queue_data,   /* Queue memory */
         ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
         int queuesize,                /* Size (capacity) of each queue */
-        ccl_global char *use_queues_flag)  /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
+        ccl_global char *use_queues_flag)  /* Flag to decide if scene_intersect kernel should
+                                            * use queues to fetch ray index */
 {
-	kernel_next_iteration_setup(globals,
-	                            data,
-	                            shader_data,
-	                            rng_coop,
-	                            throughput_coop,
-	                            PathRadiance_coop,
-	                            Ray_coop,
-	                            PathState_coop,
-	                            LightRay_dl_coop,
-	                            ISLamp_coop,
-	                            BSDFEval_coop,
-	                            LightRay_ao_coop,
-	                            AOBSDF_coop,
-	                            AOAlpha_coop,
-	                            ray_state,
-	                            Queue_data,
-	                            Queue_index,
-	                            queuesize,
-	                            use_queues_flag);
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then it means that the scene-intersect kernel
+		 * has already been executed at least once. From the next time,
+		 * the scene-intersect kernel may operate on queues to fetch the
+		 * ray index.
+		 */
+		use_queues_flag[0] = 1;
+
+		/* Mark queue indices of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during
+		 * the previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active, since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_next_iteration_setup(globals,
+		                                           data,
+		                                           shader_data,
+		                                           rng_coop,
+		                                           throughput_coop,
+		                                           PathRadiance_coop,
+		                                           Ray_coop,
+		                                           PathState_coop,
+		                                           LightRay_dl_coop,
+		                                           ISLamp_coop,
+		                                           BSDFEval_coop,
+		                                           LightRay_ao_coop,
+		                                           AOBSDF_coop,
+		                                           AOAlpha_coop,
+		                                           ray_state,
+		                                           use_queues_flag,
+		                                           ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
 }


@@ -14,16 +14,93 @@
  * limitations under the License.
  */
-#include "split/kernel_queue_enqueue.h"
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_queues.h"
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate queues:
+ *
+ * 1. Rays that have been determined to hit the background from the
+ *    "kernel_scene_intersect" kernel are enqueued in
+ *    QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel are as follows:
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on queues:
+ * State of the queues the first time this kernel is called:
+ * At entry,
+ *   both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ *   QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays;
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of the queues the other times this kernel is called:
+ * At entry,
+ *   QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty;
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ *   QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays;
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER and RAY_HIT_BACKGROUND rays.
+ */
 __kernel void kernel_ocl_path_trace_queue_enqueue(
         ccl_global int *Queue_data,   /* Queue memory */
         ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
         ccl_global char *ray_state,   /* Denotes the state of each ray */
         int queuesize)                /* Size (capacity) of each queue */
 {
-	kernel_queue_enqueue(Queue_data,
-	                     Queue_index,
-	                     ray_state,
-	                     queuesize);
+	/* We have only 2 cases (Hit/Not-Hit). */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	if(lidx < 2) {
+		local_queue_atomics[lidx] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+			get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+			                            local_queue_atomics,
+			                            Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+			get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+			                            local_queue_atomics,
+			                            Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  queuesize,
+		                                  my_lqidx,
+		                                  local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index;
+	}
 }
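The helpers used above (get_local_queue_index, get_global_per_queue_offset, get_global_queue_index) live in kernel_queues.h and are not part of this diff. As a rough sketch of the two-level scheme, under the assumption that each queue owns its own queuesize-sized slice of Queue_data, they could look like this (illustrative only, not the actual implementation):

/* Reserve one slot in this work-group's counter for the given queue. */
ccl_device unsigned int get_local_queue_index(
        int queue_number,
        ccl_local unsigned int *local_queue_atomics)
{
	return atomic_inc(&local_queue_atomics[queue_number]);
}

/* Reserve a contiguous block in the global queue for the whole work-group:
 * a single global atomic per queue per work-group. */
ccl_device unsigned int get_global_per_queue_offset(
        int queue_number,
        ccl_local unsigned int *local_queue_atomics,
        ccl_global int *Queue_index)
{
	return (unsigned int)atomic_add(&Queue_index[queue_number],
	                                local_queue_atomics[queue_number]);
}

/* Final write position; per_queue_offset holds the global offsets by now. */
ccl_device unsigned int get_global_queue_index(
        int queue_number,
        int queuesize,
        unsigned int my_lqidx,
        ccl_local unsigned int *per_queue_offset)
{
	return queue_number * queuesize
	       + per_queue_offset[queue_number]
	       + my_lqidx;
}

The effect is that each work-group pays two global atomics (one per queue) instead of one per ray, and every work-item then writes its ray index into its own reserved slot.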


@@ -28,12 +28,43 @@ __kernel void kernel_ocl_path_trace_scene_intersect(
         ccl_global int *Queue_data,        /* Memory for queues */
         ccl_global int *Queue_index,       /* Tracks the number of elements in queues */
         int queuesize,                     /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,  /* used to decide if this kernel should use queues to fetch ray index */
+        ccl_global char *use_queues_flag,  /* used to decide if this kernel should use
+                                            * queues to fetch ray index */
 #ifdef __KERNEL_DEBUG__
         DebugData *debugdata_coop,
 #endif
         int parallel_samples)              /* Number of samples to be processed in parallel */
 {
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Fetch use_queues_flag */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          0);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh) {
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
 	kernel_scene_intersect(globals,
 	                       data,
 	                       rng_coop,
@@ -42,12 +73,10 @@ __kernel void kernel_ocl_path_trace_scene_intersect(
 	                       Intersection_coop,
 	                       ray_state,
 	                       sw, sh,
-	                       Queue_data,
-	                       Queue_index,
-	                       queuesize,
 	                       use_queues_flag,
 #ifdef __KERNEL_DEBUG__
 	                       debugdata_coop,
 #endif
-	                       parallel_samples);
+	                       parallel_samples,
+	                       ray_index);
 }


@@ -29,6 +29,34 @@ __kernel void kernel_ocl_path_trace_shader_eval(
         ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
         int queuesize)                /* Size (capacity) of each queue */
 {
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+
+	/* Continue on with shader evaluation. */
 	kernel_shader_eval(globals,
 	                   data,
 	                   shader_data,
@@ -37,7 +65,5 @@ __kernel void kernel_ocl_path_trace_shader_eval(
 	                   PathState_coop,
 	                   Intersection_coop,
 	                   ray_state,
-	                   Queue_data,
-	                   Queue_index,
-	                   queuesize);
+	                   ray_index);
 }


@@ -31,6 +31,43 @@ __kernel void kernel_ocl_path_trace_shadow_blocked(
         int queuesize,       /* Size (capacity) of each queue */
         int total_num_rays)
 {
+#if 0
+	/* We will make the Queue_index entries '0' in the next kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Flag determining if the current ray is to process a shadow ray for AO or DL. */
+	char shadow_blocked_type = -1;
+	int ray_index = QUEUE_EMPTY_SLOT;
+
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+			                          Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+			                          Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
 	kernel_shadow_blocked(globals,
 	                      data,
 	                      shader_shadow,
@@ -40,8 +77,7 @@ __kernel void kernel_ocl_path_trace_shadow_blocked(
 	                      Intersection_coop_AO,
 	                      Intersection_coop_DL,
 	                      ray_state,
-	                      Queue_data,
-	                      Queue_index,
-	                      queuesize,
-	                      total_num_rays);
+	                      total_num_rays,
+	                      shadow_blocked_type,
+	                      ray_index);
 }
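As a concrete example of the thread-to-queue mapping above: if ao_queue_length is 3 and dl_queue_length is 2, threads 0-2 fetch AO shadow rays, threads 3-4 fetch DL shadow rays (re-based onto the DL queue by thread_index - ao_queue_length), and all remaining threads see QUEUE_EMPTY_SLOT and exit.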


@@ -16,8 +16,7 @@
 #include "kernel_split_common.h"
-/*
- * Note on kernel_background_buffer_update kernel.
+/* Note on kernel_background_buffer_update kernel.
  * This is the fourth kernel in the ray tracing logic, and the third
  * of the path iteration kernels. This kernel takes care of rays that hit
  * the background (sceneintersect kernel), and for the rays of
@@ -70,121 +69,93 @@
  * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
  * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
  */
-__kernel void kernel_background_buffer_update(
+ccl_device char kernel_background_buffer_update(
         ccl_global char *globals,
         ccl_constant KernelData *data,
         ccl_global char *shader_data,
         ccl_global float *per_sample_output_buffers,
         ccl_global uint *rng_state,
         ccl_global uint *rng_coop,             /* Required for buffer Update */
         ccl_global float3 *throughput_coop,    /* Required for background hit processing */
         PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
         ccl_global Ray *Ray_coop,              /* Required for background hit processing */
         ccl_global PathState *PathState_coop,  /* Required for background hit processing */
         ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
         ccl_global char *ray_state,            /* Stores information on the current state of a ray */
         int sw, int sh, int sx, int sy, int stride,
         int rng_state_offset_x,
         int rng_state_offset_y,
         int rng_state_stride,
         ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        ccl_global int *Queue_data,            /* Queues memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
         int end_sample,
         int start_sample,
 #ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs,
         unsigned int num_samples,
 #endif
 #ifdef __KERNEL_DEBUG__
         DebugData *debugdata_coop,
 #endif
-        int parallel_samples   /* Number of samples to be processed in parallel */
-        )
+        int parallel_samples,  /* Number of samples to be processed in parallel */
+        int ray_index)
 {
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(ray_index == 0) {
-		/* We will empty this queue in this kernel */
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-	}
 	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not required.
-	 * If we are executing on a CPU device, then we need to keep all threads active
-	 * since we have barrier() calls later in the kernel. CPU devices
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-#endif
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-	/* Load kernel globals structure and ShaderData structure */
-	KernelGlobals *kg = (KernelGlobals *)globals;
-	ShaderData *sd = (ShaderData *)shader_data;
+	/* Load kernel globals structure and ShaderData structure. */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
 #ifdef __KERNEL_DEBUG__
 	DebugData *debug_data = &debugdata_coop[ray_index];
 #endif
 	ccl_global PathState *state = &PathState_coop[ray_index];
 	PathRadiance *L = &PathRadiance_coop[ray_index];
 	ccl_global Ray *ray = &Ray_coop[ray_index];
 	ccl_global float3 *throughput = &throughput_coop[ray_index];
 	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
 	ccl_global uint *rng = &rng_coop[ray_index];
 #ifdef __WORK_STEALING__
 	unsigned int my_work;
 	ccl_global float *initial_per_sample_output_buffers;
 	ccl_global uint *initial_rng;
 #endif
 	unsigned int sample;
 	unsigned int tile_x;
 	unsigned int tile_y;
 	unsigned int pixel_x;
 	unsigned int pixel_y;
 	unsigned int my_sample_tile;
 #ifdef __WORK_STEALING__
 	my_work = work_array[ray_index];
 	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
 	get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
 	my_sample_tile = 0;
 	initial_per_sample_output_buffers = per_sample_output_buffers;
 	initial_rng = rng_state;
 #else  // __WORK_STEALING__
 	sample = work_array[ray_index];
 	int tile_index = ray_index / parallel_samples;
 	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
 	tile_x = tile_index % sw;
 	tile_y = tile_index / sw;
 	my_sample_tile = ray_index - (tile_index * parallel_samples);
 #endif
 	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
 	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
 	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
 		/* eval background shader if nothing hit */
 		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
 			*L_transparent = (*L_transparent) + average((*throughput));
 #ifdef __PASSES__
 			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
 #endif
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 		}
 		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
 		{
 #ifdef __BACKGROUND__
 			/* sample background shader */
@@ -193,90 +164,83 @@
 #endif
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 		}
 	}
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
 		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
 		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
 #ifdef __KERNEL_DEBUG__
 		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
 #endif
 		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
 		/* accumulate result in output buffer */
 		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
 		path_rng_end(kg, rng_state, *rng);
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 	}
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 #ifdef __WORK_STEALING__
 		/* We have completed the current work, so get the next work */
 		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
 		if(!valid_work) {
 			/* If work is invalid, this means no more work is available and the thread may exit */
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
 		}
 #else
 		if((sample + parallel_samples) >= end_sample) {
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
 		}
 #endif
 		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 #ifdef __WORK_STEALING__
 			work_array[ray_index] = my_work;
 			/* Get the sample associated with the current work */
 			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
 			/* Get pixel and tile position associated with current work */
 			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
 			my_sample_tile = 0;
 			/* Remap rng_state according to the current work */
 			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
 			/* Remap per_sample_output_buffers according to the current work */
 			per_sample_output_buffers = initial_per_sample_output_buffers
 				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
 #else
 			work_array[ray_index] = sample + parallel_samples;
 			sample = work_array[ray_index];
 			/* Get ray position from ray index */
 			pixel_x = sx + ((ray_index / parallel_samples) % sw);
 			pixel_y = sy + ((ray_index / parallel_samples) / sw);
 #endif
 			/* initialize random numbers and ray */
 			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
 			if(ray->t != 0.0f) {
 				/* Initialize throughput, L_transparent, Ray, PathState;
 				 * these rays proceed with path-iteration. */
 				*throughput = make_float3(1.0f, 1.0f, 1.0f);
 				*L_transparent = 0.0f;
 				path_radiance_init(L, kernel_data.film.use_light_pass);
 				path_state_init(kg, state, rng, sample, ray);
 #ifdef __KERNEL_DEBUG__
 				debug_data_init(debug_data);
 #endif
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 				enqueue_flag = 1;
 			} else {
 				/* These rays do not participate in path-iteration. */
 				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 				/* accumulate result in output buffer */
 				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
 				path_rng_end(kg, rng_state, *rng);
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 			}
 		}
 	}
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	 * these rays will be made active during the next SceneIntersect kernel.
-	 */
-	enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+	return enqueue_flag;
 }


@@ -16,8 +16,7 @@
 #include "kernel_split_common.h"
-/*
- * Note on kernel_data_initialization kernel
+/* Note on kernel_data_initialization kernel
  * This kernel Initializes structures needed in path-iteration kernels.
  * This is the first kernel in ray-tracing logic.
  *
@@ -51,131 +50,130 @@
  * All slots in queues are initialized to queue empty slot;
  * The number of elements in the queues is initialized to 0;
  */
-__kernel void kernel_data_init(
+ccl_device void kernel_data_init(
         ccl_global char *globals,
         ccl_global char *shader_data_sd,            /* Arguments related to ShaderData */
         ccl_global char *shader_data_sd_DL_shadow,  /* Arguments related to ShaderData */
         ccl_global float3 *P_sd,
         ccl_global float3 *P_sd_DL_shadow,
         ccl_global float3 *N_sd,
         ccl_global float3 *N_sd_DL_shadow,
         ccl_global float3 *Ng_sd,
         ccl_global float3 *Ng_sd_DL_shadow,
         ccl_global float3 *I_sd,
         ccl_global float3 *I_sd_DL_shadow,
         ccl_global int *shader_sd,
         ccl_global int *shader_sd_DL_shadow,
         ccl_global int *flag_sd,
         ccl_global int *flag_sd_DL_shadow,
         ccl_global int *prim_sd,
         ccl_global int *prim_sd_DL_shadow,
         ccl_global int *type_sd,
         ccl_global int *type_sd_DL_shadow,
         ccl_global float *u_sd,
         ccl_global float *u_sd_DL_shadow,
         ccl_global float *v_sd,
         ccl_global float *v_sd_DL_shadow,
         ccl_global int *object_sd,
         ccl_global int *object_sd_DL_shadow,
         ccl_global float *time_sd,
         ccl_global float *time_sd_DL_shadow,
         ccl_global float *ray_length_sd,
         ccl_global float *ray_length_sd_DL_shadow,
         ccl_global int *ray_depth_sd,
         ccl_global int *ray_depth_sd_DL_shadow,
         ccl_global int *transparent_depth_sd,
         ccl_global int *transparent_depth_sd_DL_shadow,
         /* Ray differentials. */
         ccl_global differential3 *dP_sd,
         ccl_global differential3 *dP_sd_DL_shadow,
         ccl_global differential3 *dI_sd,
         ccl_global differential3 *dI_sd_DL_shadow,
         ccl_global differential *du_sd,
         ccl_global differential *du_sd_DL_shadow,
         ccl_global differential *dv_sd,
         ccl_global differential *dv_sd_DL_shadow,
         /* Dp/Du */
         ccl_global float3 *dPdu_sd,
         ccl_global float3 *dPdu_sd_DL_shadow,
         ccl_global float3 *dPdv_sd,
         ccl_global float3 *dPdv_sd_DL_shadow,
         /* Object motion. */
         ccl_global Transform *ob_tfm_sd,
         ccl_global Transform *ob_tfm_sd_DL_shadow,
         ccl_global Transform *ob_itfm_sd,
         ccl_global Transform *ob_itfm_sd_DL_shadow,
         ShaderClosure *closure_sd,
         ShaderClosure *closure_sd_DL_shadow,
         ccl_global int *num_closure_sd,
         ccl_global int *num_closure_sd_DL_shadow,
         ccl_global float *randb_closure_sd,
         ccl_global float *randb_closure_sd_DL_shadow,
         ccl_global float3 *ray_P_sd,
         ccl_global float3 *ray_P_sd_DL_shadow,
         ccl_global differential3 *ray_dP_sd,
         ccl_global differential3 *ray_dP_sd_DL_shadow,
         ccl_constant KernelData *data,
         ccl_global float *per_sample_output_buffers,
         ccl_global uint *rng_state,
         ccl_global uint *rng_coop,              /* rng array to store rng values for all rays */
         ccl_global float3 *throughput_coop,     /* throughput array to store throughput values for all rays */
         ccl_global float *L_transparent_coop,   /* L_transparent array to store L_transparent values for all rays */
         PathRadiance *PathRadiance_coop,        /* PathRadiance array to store PathRadiance values for all rays */
         ccl_global Ray *Ray_coop,               /* Ray array to store Ray information for all rays */
         ccl_global PathState *PathState_coop,   /* PathState array to store PathState information for all rays */
         ccl_global char *ray_state,             /* Stores information on current state of a ray */
 #define KERNEL_TEX(type, ttype, name) \
         ccl_global type *name,
 #include "../kernel_textures.h"
         int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
         int rng_state_offset_x,
         int rng_state_offset_y,
         int rng_state_stride,
         ccl_global int *Queue_data,             /* Memory for queues */
         ccl_global int *Queue_index,            /* Tracks the number of elements in queues */
         int queuesize,                          /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,       /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
         ccl_global unsigned int *work_array,    /* work array to store which work each ray belongs to */
 #ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
         unsigned int num_samples,               /* Total number of samples per pixel */
 #endif
 #ifdef __KERNEL_DEBUG__
         DebugData *debugdata_coop,
 #endif
-        int parallel_samples    /* Number of samples to be processed in parallel */
-        )
+        int parallel_samples)   /* Number of samples to be processed in parallel */
 {
 	/* Load kernel globals structure */


@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_direct_lighting kernel.
* Note on kernel_direct_lighting kernel.
* This is the eighth kernel in the ray tracing logic. This is the seventh * This is the eighth kernel in the ray tracing logic. This is the seventh
* of the path iteration kernels. This kernel takes care of direct lighting * of the path iteration kernels. This kernel takes care of direct lighting
* logic. However, the "shadow ray cast" part of direct lighting is handled * logic. However, the "shadow ray cast" part of direct lighting is handled
@@ -49,90 +48,58 @@
* QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
* kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
*/ */
__kernel void kernel_direct_lighting( ccl_device char kernel_direct_lighting(
ccl_global char *globals, ccl_global char *globals,
ccl_constant KernelData *data, ccl_constant KernelData *data,
ccl_global char *shader_data, /* Required for direct lighting */ ccl_global char *shader_data, /* Required for direct lighting */
ccl_global char *shader_DL, /* Required for direct lighting */ ccl_global char *shader_DL, /* Required for direct lighting */
ccl_global uint *rng_coop, /* Required for direct lighting */ ccl_global uint *rng_coop, /* Required for direct lighting */
ccl_global PathState *PathState_coop, /* Required for direct lighting */ ccl_global PathState *PathState_coop, /* Required for direct lighting */
ccl_global int *ISLamp_coop, /* Required for direct lighting */ ccl_global int *ISLamp_coop, /* Required for direct lighting */
ccl_global Ray *LightRay_coop, /* Required for direct lighting */ ccl_global Ray *LightRay_coop, /* Required for direct lighting */
ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */
ccl_global char *ray_state, /* Denotes the state of each ray */ ccl_global char *ray_state, /* Denotes the state of each ray */
ccl_global int *Queue_data, /* Queue memory */ int ray_index)
ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
int queuesize /* Size (capacity) of each queue */
)
{ {
ccl_local unsigned int local_queue_atomics;
if(get_local_id(0) == 0 && get_local_id(1) == 0) {
local_queue_atomics = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
char enqueue_flag = 0; char enqueue_flag = 0;
int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); /* Load kernel globals structure and ShaderData structure */
KernelGlobals *kg = (KernelGlobals *)globals;
ShaderData *sd = (ShaderData *)shader_data;
ShaderData *sd_DL = (ShaderData *)shader_DL;
#ifdef __COMPUTE_DEVICE_GPU__ ccl_global PathState *state = &PathState_coop[ray_index];
/* If we are executing on a GPU device, we exit all threads that are not required.
* If we are executing on a CPU device, then we need to keep all threads active
* since we have barrier() calls later in the kernel. CPU devices
* expect all threads to execute the barrier statement.
*/
if(ray_index == QUEUE_EMPTY_SLOT)
return;
#endif
#ifndef __COMPUTE_DEVICE_GPU__ /* direct lighting */
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* Load kernel globals structure and ShaderData structure */
KernelGlobals *kg = (KernelGlobals *)globals;
ShaderData *sd = (ShaderData *)shader_data;
ShaderData *sd_DL = (ShaderData *)shader_DL;
ccl_global PathState *state = &PathState_coop[ray_index];
/* direct lighting */
#ifdef __EMISSION__ #ifdef __EMISSION__
if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) { if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) {
/* sample illumination from lights to find path contribution */ /* sample illumination from lights to find path contribution */
ccl_global RNG* rng = &rng_coop[ray_index]; ccl_global RNG* rng = &rng_coop[ray_index];
float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v; float light_u, light_v;
path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
LightSample ls; LightSample ls;
light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
Ray light_ray; Ray light_ray;
#ifdef __OBJECT_MOTION__ #ifdef __OBJECT_MOTION__
light_ray.time = ccl_fetch(sd, time); light_ray.time = ccl_fetch(sd, time);
#endif #endif
BsdfEval L_light; BsdfEval L_light;
bool is_lamp; bool is_lamp;
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) { if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) {
/* write intermediate data to global memory to access from the next kernel */ /* write intermediate data to global memory to access from the next kernel */
LightRay_coop[ray_index] = light_ray; LightRay_coop[ray_index] = light_ray;
BSDFEval_coop[ray_index] = L_light; BSDFEval_coop[ray_index] = L_light;
ISLamp_coop[ray_index] = is_lamp; ISLamp_coop[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */ /* Mark ray state for next shadow kernel. */
ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1; enqueue_flag = 1;
}
} }
#endif
} }
#ifndef __COMPUTE_DEVICE_GPU__ #endif
} }
#endif return enqueue_flag;
#ifdef __EMISSION__
/* Enqueue RAY_SHADOW_RAY_CAST_DL rays */
enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
#endif
} }
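For illustration, the device-specific OpenCL wrapper that now drives this common function would look roughly as follows: compact the ray index from the queue, run the common code, and enqueue the result. The wrapper name is an assumption following the kernel_ocl_path_trace_* naming pattern; the body mirrors the queue handling removed from this header in the deleted lines above.

__kernel void kernel_ocl_path_trace_direct_lighting(
        ccl_global char *globals,
        ccl_constant KernelData *data,
        ccl_global char *shader_data,
        ccl_global char *shader_DL,
        ccl_global uint *rng_coop,
        ccl_global PathState *PathState_coop,
        ccl_global int *ISLamp_coop,
        ccl_global Ray *LightRay_coop,
        ccl_global BsdfEval *BSDFEval_coop,
        ccl_global char *ray_state,
        ccl_global int *Queue_data,
        ccl_global int *Queue_index,
        int queuesize)
{
    ccl_local unsigned int local_queue_atomics;
    if(get_local_id(0) == 0 && get_local_id(1) == 0) {
        local_queue_atomics = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    char enqueue_flag = 0;
    int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
    ray_index = get_ray_index(ray_index,
                              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                              Queue_data, queuesize, 0);

#ifdef __COMPUTE_DEVICE_GPU__
    /* On GPU devices unused threads may exit; on CPU devices all threads
     * must stay alive for the barrier() calls to be well-defined. */
    if(ray_index == QUEUE_EMPTY_SLOT) {
        return;
    }
#endif

#ifndef __COMPUTE_DEVICE_GPU__
    if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
        enqueue_flag = kernel_direct_lighting(globals, data,
                                              shader_data, shader_DL,
                                              rng_coop, PathState_coop,
                                              ISLamp_coop, LightRay_coop,
                                              BSDFEval_coop, ray_state,
                                              ray_index);
#ifndef __COMPUTE_DEVICE_GPU__
    }
#endif

#ifdef __EMISSION__
    /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
    enqueue_ray_index_local(ray_index,
                            QUEUE_SHADOW_RAY_CAST_DL_RAYS,
                            enqueue_flag, queuesize,
                            &local_queue_atomics,
                            Queue_data, Queue_index);
#endif
}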


@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
* This is the sixth kernel in the ray tracing logic. This is the fifth * This is the sixth kernel in the ray tracing logic. This is the fifth
* of the path iteration kernels. This kernel takes care of the logic to process * of the path iteration kernels. This kernel takes care of the logic to process
* "material of type holdout", indirect primitive emission, bsdf blurring, * "material of type holdout", indirect primitive emission, bsdf blurring,
@@ -71,213 +70,175 @@
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
* QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
*/ */
ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
__kernel void kernel_holdout_emission_blurring_pathtermination_ao( ccl_global char *globals,
ccl_global char *globals, ccl_constant KernelData *data,
ccl_constant KernelData *data, ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */
ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ ccl_global float *per_sample_output_buffers,
ccl_global float *per_sample_output_buffers, ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ ccl_global float *L_transparent_coop, /* Required for handling holdout material */
ccl_global float *L_transparent_coop, /* Required for handling holdout material */ PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ Intersection *Intersection_coop, /* Required for indirect primitive emission */
Intersection *Intersection_coop, /* Required for indirect primitive emission */ ccl_global float3 *AOAlpha_coop, /* Required for AO */
ccl_global float3 *AOAlpha_coop, /* Required for AO */ ccl_global float3 *AOBSDF_coop, /* Required for AO */
ccl_global float3 *AOBSDF_coop, /* Required for AO */ ccl_global Ray *AOLightRay_coop, /* Required for AO */
ccl_global Ray *AOLightRay_coop, /* Required for AO */ int sw, int sh, int sx, int sy, int stride,
int sw, int sh, int sx, int sy, int stride, ccl_global char *ray_state, /* Denotes the state of each ray */
ccl_global char *ray_state, /* Denotes the state of each ray */ ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
ccl_global int *Queue_data, /* Queue memory */
ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
int queuesize, /* Size (capacity) of each queue */
#ifdef __WORK_STEALING__ #ifdef __WORK_STEALING__
unsigned int start_sample, unsigned int start_sample,
#endif #endif
int parallel_samples /* Number of samples to be processed in parallel */ int parallel_samples, /* Number of samples to be processed in parallel */
) int ray_index,
char *enqueue_flag,
char *enqueue_flag_AO_SHADOW_RAY_CAST)
{ {
ccl_local unsigned int local_queue_atomics_bg; /* Load kernel globals structure and ShaderData structure */
ccl_local unsigned int local_queue_atomics_ao; KernelGlobals *kg = (KernelGlobals *)globals;
if(get_local_id(0) == 0 && get_local_id(1) == 0) { ShaderData *sd = (ShaderData *)shader_data;
local_queue_atomics_bg = 0;
local_queue_atomics_ao = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
char enqueue_flag = 0;
char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
#ifdef __COMPUTE_DEVICE_GPU__
/* If we are executing on a GPU device, we exit all threads that are not required.
* If we are executing on a CPU device, then we need to keep all threads active
* since we have barrier() calls later in the kernel. CPU devices
* expect all threads to execute the barrier statement.
*/
if(ray_index == QUEUE_EMPTY_SLOT)
return;
#endif
#ifndef __COMPUTE_DEVICE_GPU__
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
/* Load kernel globals structure and ShaderData structure */
KernelGlobals *kg = (KernelGlobals *)globals;
ShaderData *sd = (ShaderData *)shader_data;
#ifdef __WORK_STEALING__ #ifdef __WORK_STEALING__
unsigned int my_work; unsigned int my_work;
unsigned int pixel_x; unsigned int pixel_x;
unsigned int pixel_y; unsigned int pixel_y;
#endif #endif
unsigned int tile_x; unsigned int tile_x;
unsigned int tile_y; unsigned int tile_y;
int my_sample_tile; int my_sample_tile;
unsigned int sample; unsigned int sample;
ccl_global RNG *rng = 0x0; ccl_global RNG *rng = 0x0;
ccl_global PathState *state = 0x0; ccl_global PathState *state = 0x0;
float3 throughput; float3 throughput;
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
throughput = throughput_coop[ray_index]; throughput = throughput_coop[ray_index];
state = &PathState_coop[ray_index]; state = &PathState_coop[ray_index];
rng = &rng_coop[ray_index]; rng = &rng_coop[ray_index];
#ifdef __WORK_STEALING__ #ifdef __WORK_STEALING__
my_work = work_array[ray_index]; my_work = work_array[ray_index];
sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
my_sample_tile = 0; my_sample_tile = 0;
#else // __WORK_STEALING__ #else // __WORK_STEALING__
sample = work_array[ray_index]; sample = work_array[ray_index];
/* The buffer's stride is "stride"; find x and y using ray_index. */ /* The buffer's stride is "stride"; find x and y using ray_index. */
int tile_index = ray_index / parallel_samples; int tile_index = ray_index / parallel_samples;
tile_x = tile_index % sw; tile_x = tile_index % sw;
tile_y = tile_index / sw; tile_y = tile_index / sw;
my_sample_tile = ray_index - (tile_index * parallel_samples); my_sample_tile = ray_index - (tile_index * parallel_samples);
#endif // __WORK_STEALING__ #endif // __WORK_STEALING__
per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
/* holdout */ /* holdout */
#ifdef __HOLDOUT__ #ifdef __HOLDOUT__
if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
if(kernel_data.background.transparent) { if(kernel_data.background.transparent) {
float3 holdout_weight; float3 holdout_weight;
if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
holdout_weight = make_float3(1.0f, 1.0f, 1.0f); holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
else else
holdout_weight = shader_holdout_eval(kg, sd); holdout_weight = shader_holdout_eval(kg, sd);
/* any throughput is ok, should all be identical here */ /* any throughput is ok, should all be identical here */
L_transparent_coop[ray_index] += average(holdout_weight*throughput); L_transparent_coop[ray_index] += average(holdout_weight*throughput);
} }
if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1; *enqueue_flag = 1;
}
} }
#endif
} }
#endif
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
PathRadiance *L = &PathRadiance_coop[ray_index]; PathRadiance *L = &PathRadiance_coop[ray_index];
/* holdout mask objects do not write data passes */ /* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput); kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood /* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */ * of following this particular path (diffuse, rough glossy) */
if(kernel_data.integrator.filter_glossy != FLT_MAX) { if(kernel_data.integrator.filter_glossy != FLT_MAX) {
float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) { if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
shader_bsdf_blur(kg, sd, blur_roughness); shader_bsdf_blur(kg, sd, blur_roughness);
}
} }
}
#ifdef __EMISSION__ #ifdef __EMISSION__
/* emission */ /* emission */
if(ccl_fetch(sd, flag) & SD_EMISSION) { if(ccl_fetch(sd, flag) & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */ /* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf); float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
path_radiance_accum_emission(L, throughput, emission, state->bounce); path_radiance_accum_emission(L, throughput, emission, state->bounce);
} }
#endif #endif
/* Path termination. This is a strange place to put the termination; it's /* Path termination. This is a strange place to put the termination; it's
* mainly due to the mixed-in MIS that we use. It gives too many unneeded * mainly due to the mixed-in MIS that we use. It gives too many unneeded
* shader evaluations; we only need emission if we are going to terminate. */ * shader evaluations; we only need emission if we are going to terminate. */
float probability = path_state_terminate_probability(kg, state, throughput); float probability = path_state_terminate_probability(kg, state, throughput);
if(probability == 0.0f) { if(probability == 0.0f) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1; *enqueue_flag = 1;
} }
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(probability != 1.0f) { if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
if(terminate >= probability) { if(terminate >= probability) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1; *enqueue_flag = 1;
} else { } else {
throughput_coop[ray_index] = throughput/probability; throughput_coop[ray_index] = throughput/probability;
}
} }
} }
} }
}
#ifdef __AO__ #ifdef __AO__
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */ /* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) { if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
/* todo: solve correlation */ /* todo: solve correlation */
float bsdf_u, bsdf_v; float bsdf_u, bsdf_v;
path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
float ao_factor = kernel_data.background.ao_factor; float ao_factor = kernel_data.background.ao_factor;
float3 ao_N; float3 ao_N;
AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
float3 ao_D; float3 ao_D;
float ao_pdf; float ao_pdf;
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray _ray; Ray _ray;
_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
_ray.D = ao_D; _ray.D = ao_D;
_ray.t = kernel_data.background.ao_distance; _ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__ #ifdef __OBJECT_MOTION__
_ray.time = ccl_fetch(sd, time); _ray.time = ccl_fetch(sd, time);
#endif #endif
_ray.dP = ccl_fetch(sd, dP); _ray.dP = ccl_fetch(sd, dP);
_ray.dD = differential3_zero(); _ray.dD = differential3_zero();
AOLightRay_coop[ray_index] = _ray; AOLightRay_coop[ray_index] = _ray;
ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
enqueue_flag_AO_SHADOW_RAY_CAST = 1; *enqueue_flag_AO_SHADOW_RAY_CAST = 1;
}
} }
} }
#endif
#ifndef __COMPUTE_DEVICE_GPU__
} }
#endif #endif
/* Enqueue RAY_UPDATE_BUFFER rays */
enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
#ifdef __AO__
/* Enqueue to-shadow-ray-cast rays */
enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
#endif
} }
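The caller is now responsible for flushing both out-flags. A hypothetical helper (not part of this commit) sketches that device-side step; the two local counters must be zero-initialized and a barrier issued before the common function runs, as in the deleted lines above.

/* Hypothetical helper, not part of this commit: flush the two flags
 * produced by kernel_holdout_emission_blurring_pathtermination_ao()
 * into their queues from the device-specific kernel. */
ccl_device void flush_holdout_ao_queues(int ray_index,
                                        char enqueue_flag,
                                        char enqueue_flag_AO_SHADOW_RAY_CAST,
                                        int queuesize,
                                        ccl_local unsigned int *local_queue_atomics_bg,
                                        ccl_local unsigned int *local_queue_atomics_ao,
                                        ccl_global int *Queue_data,
                                        ccl_global int *Queue_index)
{
    /* Enqueue RAY_UPDATE_BUFFER rays. */
    enqueue_ray_index_local(ray_index,
                            QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                            enqueue_flag, queuesize,
                            local_queue_atomics_bg,
                            Queue_data, Queue_index);
#ifdef __AO__
    /* Enqueue to-shadow-ray-cast rays. */
    enqueue_ray_index_local(ray_index,
                            QUEUE_SHADOW_RAY_CAST_AO_RAYS,
                            enqueue_flag_AO_SHADOW_RAY_CAST, queuesize,
                            local_queue_atomics_ao,
                            Queue_data, Queue_index);
#endif
}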


@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_lamp_emission
* Note on kernel_lamp_emission
* This is the 3rd kernel in the ray-tracing logic. This is the second of the * This is the 3rd kernel in the ray-tracing logic. This is the second of the
* path-iteration kernels. This kernel takes care of the indirect lamp emission logic. * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
@@ -40,55 +39,23 @@
* *
* Note: shader_data is neither input nor output; it's just filled and consumed within this same kernel_lamp_emission kernel. * Note: shader_data is neither input nor output; it's just filled and consumed within this same kernel_lamp_emission kernel.
*/ */
__kernel void kernel_lamp_emission( ccl_device void kernel_lamp_emission(
ccl_global char *globals, ccl_global char *globals,
ccl_constant KernelData *data, ccl_constant KernelData *data,
ccl_global char *shader_data, /* Required for lamp emission */ ccl_global char *shader_data, /* Required for lamp emission */
ccl_global float3 *throughput_coop, /* Required for lamp emission */ ccl_global float3 *throughput_coop, /* Required for lamp emission */
PathRadiance *PathRadiance_coop, /* Required for lamp emission */ PathRadiance *PathRadiance_coop, /* Required for lamp emission */
ccl_global Ray *Ray_coop, /* Required for lamp emission */ ccl_global Ray *Ray_coop, /* Required for lamp emission */
ccl_global PathState *PathState_coop, /* Required for lamp emission */ ccl_global PathState *PathState_coop, /* Required for lamp emission */
Intersection *Intersection_coop, /* Required for lamp emission */ Intersection *Intersection_coop, /* Required for lamp emission */
ccl_global char *ray_state, /* Denotes the state of each ray */ ccl_global char *ray_state, /* Denotes the state of each ray */
int sw, int sh, int sw, int sh,
ccl_global int *Queue_data, /* Memory for queues */ ccl_global char *use_queues_flag, /* Used to decide if this kernel should use
ccl_global int *Queue_index, /* Tracks the number of elements in queues */ * queues to fetch ray index
int queuesize, /* Size (capacity) of queues */ */
ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ int parallel_samples, /* Number of samples to be processed in parallel */
int parallel_samples /* Number of samples to be processed in parallel */ int ray_index)
)
{ {
int x = get_global_id(0);
int y = get_global_id(1);
/* We will empty this queue in this kernel */
if(get_global_id(0) == 0 && get_global_id(1) == 0) {
Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
}
/* Fetch use_queues_flag */
ccl_local char local_use_queues_flag;
if(get_local_id(0) == 0 && get_local_id(1) == 0) {
local_use_queues_flag = use_queues_flag[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
int ray_index;
if(local_use_queues_flag) {
int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1);
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
}
} else {
if(x < (sw * parallel_samples) && y < sh) {
ray_index = x + y * (sw * parallel_samples);
} else {
return;
}
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
KernelGlobals *kg = (KernelGlobals *)globals; KernelGlobals *kg = (KernelGlobals *)globals;
ShaderData *sd = (ShaderData *)shader_data; ShaderData *sd = (ShaderData *)shader_data;

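Fetching the ray index also becomes the device code's job. A sketch of the wrapper for this kernel, again assuming the kernel_ocl_path_trace_* naming; note that the queue fetch here passes 1 to clear the consumed slot, while kernel_scene_intersect below passes 0 and leaves the queue intact.

__kernel void kernel_ocl_path_trace_lamp_emission(
        ccl_global char *globals,
        ccl_constant KernelData *data,
        ccl_global char *shader_data,
        ccl_global float3 *throughput_coop,
        PathRadiance *PathRadiance_coop,
        ccl_global Ray *Ray_coop,
        ccl_global PathState *PathState_coop,
        Intersection *Intersection_coop,
        ccl_global char *ray_state,
        int sw, int sh,
        ccl_global int *Queue_data,
        ccl_global int *Queue_index,
        int queuesize,
        ccl_global char *use_queues_flag,
        int parallel_samples)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    /* We will empty this queue in this kernel. */
    if(x == 0 && y == 0) {
        Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
    }

    /* Fetch use_queues_flag. */
    ccl_local char local_use_queues_flag;
    if(get_local_id(0) == 0 && get_local_id(1) == 0) {
        local_use_queues_flag = use_queues_flag[0];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    int ray_index;
    if(local_use_queues_flag) {
        int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
        ray_index = get_ray_index(thread_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                  Queue_data, queuesize, 1);
        if(ray_index == QUEUE_EMPTY_SLOT) {
            return;
        }
    }
    else {
        /* Linear index into the tile while queues are not in use yet. */
        if(x < (sw * parallel_samples) && y < sh) {
            ray_index = x + y * (sw * parallel_samples);
        }
        else {
            return;
        }
    }

    kernel_lamp_emission(globals, data, shader_data,
                         throughput_coop, PathRadiance_coop,
                         Ray_coop, PathState_coop, Intersection_coop,
                         ray_state, sw, sh,
                         use_queues_flag, parallel_samples, ray_index);
}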

@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_setup_next_iteration kernel.
* Note on kernel_setup_next_iteration kernel.
* This is the tenth kernel in the ray tracing logic. This is the ninth * This is the tenth kernel in the ray tracing logic. This is the ninth
* of the path iteration kernels. This kernel takes care of setting up * of the path iteration kernels. This kernel takes care of setting up
* Ray for the next iteration of path-iteration and accumulating radiance * Ray for the next iteration of path-iteration and accumulating radiance
@@ -60,117 +59,74 @@
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
*/ */
ccl_device char kernel_next_iteration_setup(
__kernel void kernel_next_iteration_setup( ccl_global char *globals,
ccl_global char *globals, ccl_constant KernelData *data,
ccl_constant KernelData *data, ccl_global char *shader_data, /* Required for setting up ray for next iteration */
ccl_global char *shader_data, /* Required for setting up ray for next iteration */ ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */
ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */
ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */
ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */
ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */
ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */
ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */
ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */
ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */
ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ ccl_global char *ray_state, /* Denotes the state of each ray */
ccl_global char *ray_state, /* Denotes the state of each ray */ ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should
ccl_global int *Queue_data, /* Queue memory */ * use queues to fetch ray index */
ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ int ray_index)
int queuesize, /* Size (capacity) of each queue */
ccl_global char *use_queues_flag /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
)
{ {
ccl_local unsigned int local_queue_atomics;
if(get_local_id(0) == 0 && get_local_id(1) == 0) {
local_queue_atomics = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(get_global_id(0) == 0 && get_global_id(1) == 0) {
/* If we are here, then it means that the scene-intersect kernel
* has already been executed at least once. From now on, the
* scene-intersect kernel may operate on queues to fetch ray indices.
*/
use_queues_flag[0] = 1;
/* Mark the QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS
* queues as empty; they were consumed during the previous kernel.
*/
Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
}
char enqueue_flag = 0; char enqueue_flag = 0;
int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
#ifdef __COMPUTE_DEVICE_GPU__ /* Load kernel globals structure and ShaderData structure */
/* If we are executing on a GPU device, we exit all threads that are not required. KernelGlobals *kg = (KernelGlobals *)globals;
* If we are executing on a CPU device, then we need to keep all threads active ShaderData *sd = (ShaderData *)shader_data;
* since we have barrier() calls later in the kernel. CPU devices PathRadiance *L = 0x0;
* expect all threads to execute the barrier statement. ccl_global PathState *state = 0x0;
*/
if(ray_index == QUEUE_EMPTY_SLOT)
return;
#endif
#ifndef __COMPUTE_DEVICE_GPU__ /* Path radiance update for AO/Direct_lighting's shadow blocked */
if(ray_index != QUEUE_EMPTY_SLOT) { if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
#endif state = &PathState_coop[ray_index];
/* Load kernel globals structure and ShaderData structure */ L = &PathRadiance_coop[ray_index];
KernelGlobals *kg = (KernelGlobals *)globals; float3 _throughput = throughput_coop[ray_index];
ShaderData *sd = (ShaderData *)shader_data;
PathRadiance *L = 0x0;
ccl_global PathState *state = 0x0;
/* Path radiance update for AO/Direct_lighting's shadow blocked */ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { float3 shadow = LightRay_ao_coop[ray_index].P;
state = &PathState_coop[ray_index]; char update_path_radiance = LightRay_ao_coop[ray_index].t;
L = &PathRadiance_coop[ray_index]; if(update_path_radiance) {
float3 _throughput = throughput_coop[ray_index]; path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce);
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
float3 shadow = LightRay_ao_coop[ray_index].P;
char update_path_radiance = LightRay_ao_coop[ray_index].t;
if(update_path_radiance) {
path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce);
}
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
}
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
float3 shadow = LightRay_dl_coop[ray_index].P;
char update_path_radiance = LightRay_dl_coop[ray_index].t;
if(update_path_radiance) {
BsdfEval L_light = BSDFEval_coop[ray_index];
path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]);
}
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
} }
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
} }
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
float3 shadow = LightRay_dl_coop[ray_index].P;
ccl_global float3 *throughput = &throughput_coop[ray_index]; char update_path_radiance = LightRay_dl_coop[ray_index].t;
ccl_global Ray *ray = &Ray_coop[ray_index]; if(update_path_radiance) {
ccl_global RNG* rng = &rng_coop[ray_index]; BsdfEval L_light = BSDFEval_coop[ray_index];
state = &PathState_coop[ray_index]; path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]);
L = &PathRadiance_coop[ray_index];
/* compute direct lighting and next bounce */
if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
} }
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
} }
#ifndef __COMPUTE_DEVICE_GPU__
} }
#endif
/* Enqueue RAY_UPDATE_BUFFER rays */ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
ccl_global float3 *throughput = &throughput_coop[ray_index];
ccl_global Ray *ray = &Ray_coop[ray_index];
ccl_global RNG* rng = &rng_coop[ray_index];
state = &PathState_coop[ray_index];
L = &PathRadiance_coop[ray_index];
/* compute direct lighting and next bounce */
if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
}
}
return enqueue_flag;
} }
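A sketch of the matching wrapper, again with an assumed name: the single-thread bookkeeping (raising use_queues_flag, resetting the two shadow queues) and the final enqueue both live on the device side now, with the common function only returning the flag.

__kernel void kernel_ocl_path_trace_next_iteration_setup(
        ccl_global char *globals,
        ccl_constant KernelData *data,
        ccl_global char *shader_data,
        ccl_global uint *rng_coop,
        ccl_global float3 *throughput_coop,
        PathRadiance *PathRadiance_coop,
        ccl_global Ray *Ray_coop,
        ccl_global PathState *PathState_coop,
        ccl_global Ray *LightRay_dl_coop,
        ccl_global int *ISLamp_coop,
        ccl_global BsdfEval *BSDFEval_coop,
        ccl_global Ray *LightRay_ao_coop,
        ccl_global float3 *AOBSDF_coop,
        ccl_global float3 *AOAlpha_coop,
        ccl_global char *ray_state,
        ccl_global int *Queue_data,
        ccl_global int *Queue_index,
        int queuesize,
        ccl_global char *use_queues_flag)
{
    ccl_local unsigned int local_queue_atomics;
    if(get_local_id(0) == 0 && get_local_id(1) == 0) {
        local_queue_atomics = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if(get_global_id(0) == 0 && get_global_id(1) == 0) {
        /* Scene-intersect has run at least once; from now on it may
         * operate on queues to fetch ray indices. */
        use_queues_flag[0] = 1;
        /* The shadow queues were consumed during the previous kernel. */
        Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
        Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
    }

    char enqueue_flag = 0;
    int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
    ray_index = get_ray_index(ray_index,
                              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                              Queue_data, queuesize, 0);

#ifdef __COMPUTE_DEVICE_GPU__
    if(ray_index == QUEUE_EMPTY_SLOT) {
        return;
    }
#endif

#ifndef __COMPUTE_DEVICE_GPU__
    if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
        enqueue_flag =
            kernel_next_iteration_setup(globals, data, shader_data,
                                        rng_coop, throughput_coop,
                                        PathRadiance_coop, Ray_coop,
                                        PathState_coop,
                                        LightRay_dl_coop, ISLamp_coop,
                                        BSDFEval_coop, LightRay_ao_coop,
                                        AOBSDF_coop, AOAlpha_coop,
                                        ray_state, use_queues_flag,
                                        ray_index);
#ifndef __COMPUTE_DEVICE_GPU__
    }
#endif

    /* Enqueue RAY_UPDATE_BUFFER rays. */
    enqueue_ray_index_local(ray_index,
                            QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                            enqueue_flag, queuesize,
                            &local_queue_atomics,
                            Queue_data, Queue_index);
}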


@@ -1,98 +0,0 @@
/*
* Copyright 2011-2015 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../kernel_compat_opencl.h"
#include "../kernel_math.h"
#include "../kernel_types.h"
#include "../kernel_globals.h"
#include "../kernel_queues.h"
/*
* The kernel "kernel_queue_enqueue" enqueues rays of
* different ray states into their appropriate queues;
* 1. Rays that have been determined to hit the background from the
* "kernel_scene_intersect" kernel
* are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
* 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
*
* The input and output of the kernel are as follows:
*
* ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
* Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
* Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |
* queuesize -------------------------------------------| |
*
* Note on queues:
* State of queues during the first time this kernel is called:
* At entry,
* Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
* At exit,
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
*
* State of queues during subsequent calls of this kernel:
* At entry,
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
* At exit,
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
*/
__kernel void kernel_queue_enqueue(
ccl_global int *Queue_data, /* Queue memory */
ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
ccl_global char *ray_state, /* Denotes the state of each ray */
int queuesize /* Size (capacity) of each queue */
)
{
/* We have only 2 cases (Hit/Not-Hit) */
ccl_local unsigned int local_queue_atomics[2];
int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
if(lidx < 2) {
local_queue_atomics[lidx] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queue_number = -1;
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
} else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
}
unsigned int my_lqidx;
if(queue_number != -1) {
my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lidx == 0) {
local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index);
local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index);
}
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int my_gqidx;
if(queue_number != -1) {
my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics);
Queue_data[my_gqidx] = ray_index;
}
}
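For readers tracing the enqueue above: each work-group first counts its candidates with a local atomic, one thread then reserves a contiguous range in the global queue with a single global atomic, and every thread finally writes its ray index into its reserved slot. The helpers below sketch this along the lines of kernel_queues.h; the exact signatures are an assumption.

ccl_device unsigned int get_local_queue_index(
        int queue_number,
        ccl_local unsigned int *local_queue_atomics)
{
    /* Reserve one slot in this work-group's local counter. */
    return atomic_inc(&local_queue_atomics[queue_number]);
}

ccl_device unsigned int get_global_per_queue_offset(
        int queue_number,
        ccl_local unsigned int *local_queue_atomics,
        ccl_global int *Queue_index)
{
    /* One atomic per work-group reserves a contiguous global range. */
    return atomic_add(&Queue_index[queue_number],
                      local_queue_atomics[queue_number]);
}

ccl_device unsigned int get_global_queue_index(
        int queue_number,
        int queuesize,
        unsigned int lqidx,
        ccl_local unsigned int *global_per_queue_offset)
{
    /* Queues live back-to-back in Queue_data, queuesize apart. */
    return queue_number * queuesize
         + global_per_queue_offset[queue_number]
         + lqidx;
}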


@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_scene_intersect kernel.
* Note on kernel_scene_intersect kernel.
* This is the second kernel in the ray tracing logic. This is the first * This is the second kernel in the ray tracing logic. This is the first
* of the path iteration kernels. This kernel takes care of scene_intersect function. * of the path iteration kernels. This kernel takes care of scene_intersect function.
* *
@@ -63,51 +62,23 @@
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
*/ */
__kernel void kernel_scene_intersect( ccl_device void kernel_scene_intersect(
ccl_global char *globals, ccl_global char *globals,
ccl_constant KernelData *data, ccl_constant KernelData *data,
ccl_global uint *rng_coop, ccl_global uint *rng_coop,
ccl_global Ray *Ray_coop, /* Required for scene_intersect */ ccl_global Ray *Ray_coop, /* Required for scene_intersect */
ccl_global PathState *PathState_coop, /* Required for scene_intersect */ ccl_global PathState *PathState_coop, /* Required for scene_intersect */
Intersection *Intersection_coop, /* Required for scene_intersect */ Intersection *Intersection_coop, /* Required for scene_intersect */
ccl_global char *ray_state, /* Denotes the state of each ray */ ccl_global char *ray_state, /* Denotes the state of each ray */
int sw, int sh, int sw, int sh,
ccl_global int *Queue_data, /* Memory for queues */ ccl_global char *use_queues_flag, /* used to decide if this kernel should use
ccl_global int *Queue_index, /* Tracks the number of elements in queues */ * queues to fetch ray index */
int queuesize, /* Size (capacity) of queues */
ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */
#ifdef __KERNEL_DEBUG__ #ifdef __KERNEL_DEBUG__
DebugData *debugdata_coop, DebugData *debugdata_coop,
#endif #endif
int parallel_samples /* Number of samples to be processed in parallel */ int parallel_samples, /* Number of samples to be processed in parallel */
) int ray_index)
{ {
int x = get_global_id(0);
int y = get_global_id(1);
/* Fetch use_queues_flag */
ccl_local char local_use_queues_flag;
if(get_local_id(0) == 0 && get_local_id(1) == 0) {
local_use_queues_flag = use_queues_flag[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
int ray_index;
if(local_use_queues_flag) {
int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
}
} else {
if(x < (sw * parallel_samples) && y < sh) {
ray_index = x + y * (sw * parallel_samples);
} else {
return;
}
}
/* All regenerated rays become active here */ /* All regenerated rays become active here */
if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);


@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_shader_eval kernel
* Note on kernel_shader_eval kernel
* This kernel is the 5th kernel in the ray tracing logic. This is * This kernel is the 5th kernel in the ray tracing logic. This is
* the 4th kernel in path iteration. This kernel sets up the ShaderData * the 4th kernel in path iteration. This kernel sets up the ShaderData
* structure from the values computed by the previous kernels. It also identifies * structure from the values computed by the previous kernels. It also identifies
@@ -45,39 +44,17 @@
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
*/ */
ccl_device void kernel_shader_eval(
__kernel void kernel_shader_eval( ccl_global char *globals,
ccl_global char *globals, ccl_constant KernelData *data,
ccl_constant KernelData *data, ccl_global char *shader_data, /* Output ShaderData structure to be filled */
ccl_global char *shader_data, /* Output ShaderData structure to be filled */ ccl_global uint *rng_coop, /* Required for rbsdf calculation */
ccl_global uint *rng_coop, /* Required for rbsdf calculation */ ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */
ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */
ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ Intersection *Intersection_coop, /* Required for setting up shader from ray */
Intersection *Intersection_coop, /* Required for setting up shader from ray */ ccl_global char *ray_state, /* Denotes the state of each ray */
ccl_global char *ray_state, /* Denotes the state of each ray */ int ray_index)
ccl_global int *Queue_data, /* queue memory */
ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
int queuesize /* Size (capacity) of each queue */
)
{ {
int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue */
ccl_local unsigned int local_queue_atomics;
if(get_local_id(0) == 0 && get_local_id(1) == 0) {
local_queue_atomics = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
if(ray_index == QUEUE_EMPTY_SLOT)
return;
/* Continue on with shader evaluation */
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
KernelGlobals *kg = (KernelGlobals *)globals; KernelGlobals *kg = (KernelGlobals *)globals;
ShaderData *sd = (ShaderData *)shader_data; ShaderData *sd = (ShaderData *)shader_data;

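Note the ordering subtlety preserved from the deleted lines: RAY_TO_REGENERATE rays are enqueued using the raw thread index before the index is compacted through QUEUE_ACTIVE_AND_REGENERATED_RAYS. A wrapper sketch (name assumed):

__kernel void kernel_ocl_path_trace_shader_eval(
        ccl_global char *globals,
        ccl_constant KernelData *data,
        ccl_global char *shader_data,
        ccl_global uint *rng_coop,
        ccl_global Ray *Ray_coop,
        ccl_global PathState *PathState_coop,
        Intersection *Intersection_coop,
        ccl_global char *ray_state,
        ccl_global int *Queue_data,
        ccl_global int *Queue_index,
        int queuesize)
{
    /* Enqueue RAY_TO_REGENERATE rays into
     * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
    ccl_local unsigned int local_queue_atomics;
    if(get_local_id(0) == 0 && get_local_id(1) == 0) {
        local_queue_atomics = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
    char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
    enqueue_ray_index_local(ray_index,
                            QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                            enqueue_flag, queuesize,
                            &local_queue_atomics,
                            Queue_data, Queue_index);

    /* Continue on with shader evaluation. */
    ray_index = get_ray_index(ray_index,
                              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                              Queue_data, queuesize, 0);
    if(ray_index == QUEUE_EMPTY_SLOT) {
        return;
    }
    kernel_shader_eval(globals, data, shader_data,
                       rng_coop, Ray_coop, PathState_coop,
                       Intersection_coop, ray_state, ray_index);
}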

@@ -16,8 +16,7 @@
#include "kernel_split_common.h" #include "kernel_split_common.h"
/* /* Note on kernel_shadow_blocked kernel.
* Note on kernel_shadow_blocked kernel.
* This is the ninth kernel in the ray tracing logic. This is the eighth * This is the ninth kernel in the ray tracing logic. This is the eighth
* of the path iteration kernels. This kernel takes care of "shadow ray cast" * of the path iteration kernels. This kernel takes care of "shadow ray cast"
* logic of the direct lighting and AO part of ray tracing. * logic of the direct lighting and AO part of ray tracing.
@@ -29,9 +28,9 @@
* LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop
* ray_state ---------------------------------------| |--- ray_state * ray_state ---------------------------------------| |--- ray_state
* Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| |
* Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| |
* kg (globals + data) -----------------------------| | * kg (globals + data) -----------------------------| |
* queuesize ---------------------------------------| | * queuesize ---------------------------------------| |
* *
@@ -46,63 +45,26 @@
* and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
* QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
*/ */
ccl_device void kernel_shadow_blocked(
__kernel void kernel_shadow_blocked( ccl_global char *globals,
ccl_global char *globals, ccl_constant KernelData *data,
ccl_constant KernelData *data, ccl_global char *shader_shadow, /* Required for shadow blocked */
ccl_global char *shader_shadow, /* Required for shadow blocked */ ccl_global PathState *PathState_coop, /* Required for shadow blocked */
ccl_global PathState *PathState_coop, /* Required for shadow blocked */ ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */
ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */
ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ Intersection *Intersection_coop_AO,
Intersection *Intersection_coop_AO, Intersection *Intersection_coop_DL,
Intersection *Intersection_coop_DL, ccl_global char *ray_state,
ccl_global char *ray_state, int total_num_rays,
ccl_global int *Queue_data, /* Queue memory */ char shadow_blocked_type,
ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ int ray_index)
int queuesize, /* Size (capacity) of each queue */
int total_num_rays
)
{ {
#if 0 /* Flag determining if we need to update L. */
/* we will make the Queue_index entries '0' in the next kernel */
if(get_global_id(0) == 0 && get_global_id(1) == 0) {
/* We empty this queue here */
Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
}
#endif
int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
ccl_local unsigned int ao_queue_length;
ccl_local unsigned int dl_queue_length;
if(lidx == 0) {
ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Flag determining if the current ray should process the shadow ray for AO or DL. */
char shadow_blocked_type = -1;
/* flag determining if we need to update L */
char update_path_radiance = 0; char update_path_radiance = 0;
int ray_index = QUEUE_EMPTY_SLOT; if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
if(thread_index < ao_queue_length + dl_queue_length) { {
if(thread_index < ao_queue_length) {
ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
} else {
ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
}
}
if(ray_index == QUEUE_EMPTY_SLOT)
return;
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
/* Load kernel global structure */ /* Load kernel global structure */
KernelGlobals *kg = (KernelGlobals *)globals; KernelGlobals *kg = (KernelGlobals *)globals;
ShaderData *sd_shadow = (ShaderData *)shader_shadow; ShaderData *sd_shadow = (ShaderData *)shader_shadow;
@@ -113,13 +75,24 @@ __kernel void kernel_shadow_blocked(
Intersection *isect_ao_global = &Intersection_coop_AO[ray_index]; Intersection *isect_ao_global = &Intersection_coop_AO[ray_index];
Intersection *isect_dl_global = &Intersection_coop_DL[ray_index]; Intersection *isect_dl_global = &Intersection_coop_DL[ray_index];
ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global; ccl_global Ray *light_ray_global =
Intersection *isect_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global; shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
? light_ray_ao_global
: light_ray_dl_global;
Intersection *isect_global =
shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global;
float3 shadow; float3 shadow;
update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global)); update_path_radiance = !(shadow_blocked(kg,
state,
light_ray_global,
&shadow,
sd_shadow,
isect_global));
/* We use light_ray_global's P and t to store shadow and update_path_radiance */ /* We use light_ray_global's P and t to store shadow and
* update_path_radiance.
*/
light_ray_global->P = shadow; light_ray_global->P = shadow;
light_ray_global->t = update_path_radiance; light_ray_global->t = update_path_radiance;
} }
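This wrapper is the most involved one, since it merges two queues into a single thread range. A sketch grounded in the deleted lines above, with the get_local_size(0) fix applied (name assumed):

__kernel void kernel_ocl_path_trace_shadow_blocked(
        ccl_global char *globals,
        ccl_constant KernelData *data,
        ccl_global char *shader_shadow,
        ccl_global PathState *PathState_coop,
        ccl_global Ray *LightRay_dl_coop,
        ccl_global Ray *LightRay_ao_coop,
        Intersection *Intersection_coop_AO,
        Intersection *Intersection_coop_DL,
        ccl_global char *ray_state,
        ccl_global int *Queue_data,
        ccl_global int *Queue_index,
        int queuesize,
        int total_num_rays)
{
    int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);

    ccl_local unsigned int ao_queue_length;
    ccl_local unsigned int dl_queue_length;
    if(lidx == 0) {
        ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
        dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Flag determining if the current ray should process the shadow
     * ray for AO or for direct lighting. */
    char shadow_blocked_type = -1;

    int ray_index = QUEUE_EMPTY_SLOT;
    int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
    if(thread_index < ao_queue_length + dl_queue_length) {
        if(thread_index < ao_queue_length) {
            ray_index = get_ray_index(thread_index,
                                      QUEUE_SHADOW_RAY_CAST_AO_RAYS,
                                      Queue_data, queuesize, 1);
            shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
        }
        else {
            ray_index = get_ray_index(thread_index - ao_queue_length,
                                      QUEUE_SHADOW_RAY_CAST_DL_RAYS,
                                      Queue_data, queuesize, 1);
            shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
        }
    }

    if(ray_index == QUEUE_EMPTY_SLOT) {
        return;
    }

    kernel_shadow_blocked(globals, data, shader_shadow,
                          PathState_coop,
                          LightRay_dl_coop, LightRay_ao_coop,
                          Intersection_coop_AO, Intersection_coop_DL,
                          ray_state, total_num_rays,
                          shadow_blocked_type, ray_index);
}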


@@ -19,21 +19,19 @@
#include "../kernel_types.h" #include "../kernel_types.h"
#include "../kernel_globals.h" #include "../kernel_globals.h"
/* /* Since we process various samples in parallel, the output radiance of different samples
* Since we process various samples in parallel, the output radiance of different samples * is stored in different locations. This kernel combines the output radiance contributed
* is stored in different locations. This kernel combines the output radiance contributed * by all the different samples and stores it in the RenderTile's output buffer.
* by all the different samples and stores it in the RenderTile's output buffer. */
*/ ccl_device void kernel_sum_all_radiance(
ccl_constant KernelData *data, /* To get pass_stride to offset into buffer */
__kernel void kernel_sum_all_radiance( ccl_global float *buffer, /* Output buffer of RenderTile */
ccl_constant KernelData *data, /* To get pass_stride to offset into buffer */ ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */
ccl_global float *buffer, /* Output buffer of RenderTile */ int parallel_samples, int sw, int sh, int stride,
ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ int buffer_offset_x,
int parallel_samples, int sw, int sh, int stride, int buffer_offset_y,
int buffer_offset_x, int buffer_stride,
int buffer_offset_y, int start_sample)
int buffer_stride,
int start_sample)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);