Cycles: Implement record-all transparent shadow function for GPU
The idea is to record all possible transparent intersections when shooting a transparent ray on the GPU (similar to what we were already doing on the CPU). This avoids the need to perform whole ray-to-scene intersection queries for each intersection, and greatly speeds up cases like transparent hair, at the cost of extra memory. This commit is a base ground for now, and the feature is kept disabled until some further tweaks are done.
This commit is contained in:
@@ -357,7 +357,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
|
#if defined(__SHADOW_RECORD_ALL__) || defined(__VOLUME_RECORD_ALL__)
|
||||||
/* ToDo: Move to another file? */
|
/* ToDo: Move to another file? */
|
||||||
ccl_device int intersections_compare(const void *a, const void *b)
|
ccl_device int intersections_compare(const void *a, const void *b)
|
||||||
{
|
{
|
||||||
@@ -371,7 +371,25 @@ ccl_device int intersections_compare(const void *a, const void *b)
|
|||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
|
||||||
|
{
|
||||||
|
#ifdef __KERNEL_GPU__
|
||||||
|
/* Use bubble sort which has more friendly memory pattern on GPU. */
|
||||||
|
int i, j;
|
||||||
|
for(i = 0; i < num_hits; ++i) {
|
||||||
|
for(j = 0; j < num_hits - 1; ++j) {
|
||||||
|
if(hits[j].t < hits[j + 1].t) {
|
||||||
|
Intersection tmp = hits[j];
|
||||||
|
hits[j] = hits[j + 1];
|
||||||
|
hits[j + 1] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@@ -34,10 +34,8 @@ ccl_device_inline bool shadow_handle_transparent_isect(KernelGlobals *kg,
|
|||||||
kernel_volume_shadow(kg, shadow_sd, state, &segment_ray, throughput);
|
kernel_volume_shadow(kg, shadow_sd, state, &segment_ray, throughput);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Setup shader data at surface. */
|
/* Setup shader data at surface. */
|
||||||
shader_setup_from_ray(kg, shadow_sd, isect, ray);
|
shader_setup_from_ray(kg, shadow_sd, isect, ray);
|
||||||
|
|
||||||
/* Attenuation from transparent surface. */
|
/* Attenuation from transparent surface. */
|
||||||
if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
|
if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
|
||||||
path_state_modify_bounce(state, true);
|
path_state_modify_bounce(state, true);
|
||||||
@@ -51,42 +49,19 @@ ccl_device_inline bool shadow_handle_transparent_isect(KernelGlobals *kg,
|
|||||||
path_state_modify_bounce(state, false);
|
path_state_modify_bounce(state, false);
|
||||||
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
|
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Stop if all light is blocked. */
|
/* Stop if all light is blocked. */
|
||||||
if(is_zero(*throughput)) {
|
if(is_zero(*throughput)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __VOLUME__
|
#ifdef __VOLUME__
|
||||||
/* Exit/enter volume. */
|
/* Exit/enter volume. */
|
||||||
kernel_volume_stack_enter_exit(kg, shadow_sd, state->volume_stack);
|
kernel_volume_stack_enter_exit(kg, shadow_sd, state->volume_stack);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SHADOW_RECORD_ALL__
|
#ifdef __SHADOW_RECORD_ALL__
|
||||||
|
/* Shadow function to compute how much light is blocked,
|
||||||
ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
|
|
||||||
{
|
|
||||||
#ifdef __KERNEL_GPU__
|
|
||||||
/* Use bubble sort which has more friendly memory pattern on GPU. */
|
|
||||||
int i, j;
|
|
||||||
for(i = 0; i < num_hits; ++i) {
|
|
||||||
for(j = 0; j < num_hits - 1; ++j) {
|
|
||||||
if(hits[j].t < hits[j + 1].t) {
|
|
||||||
Intersection tmp = hits[j];
|
|
||||||
hits[j] = hits[j + 1];
|
|
||||||
hits[j + 1] = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Shadow function to compute how much light is blocked, CPU variation.
|
|
||||||
*
|
*
|
||||||
* We trace a single ray. If it hits any opaque surface, or more than a given
|
* We trace a single ray. If it hits any opaque surface, or more than a given
|
||||||
* number of transparent surfaces is hit, then we consider the geometry to be
|
* number of transparent surfaces is hit, then we consider the geometry to be
|
||||||
@@ -104,12 +79,20 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
|
|||||||
* or there is a performance increase anyway due to avoiding the need to send
|
* or there is a performance increase anyway due to avoiding the need to send
|
||||||
* two rays with transparent shadows.
|
* two rays with transparent shadows.
|
||||||
*
|
*
|
||||||
* This is CPU only because of qsort, and malloc or high stack space usage to
|
* On CPU it'll handle all transparent bounces (by allocating storage for
|
||||||
* record all these intersections. */
|
* intersections when they don't fit into the stack storage).
|
||||||
|
*
|
||||||
|
* On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
|
||||||
|
* is something to be kept an eye on.
|
||||||
|
*/
|
||||||
|
|
||||||
#define STACK_MAX_HITS 64
|
#define SHADOW_STACK_MAX_HITS 64
|
||||||
|
|
||||||
ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
|
ccl_device_inline bool shadow_blocked_all(KernelGlobals *kg,
|
||||||
|
ShaderData *shadow_sd,
|
||||||
|
PathState *state,
|
||||||
|
Ray *ray,
|
||||||
|
float3 *shadow)
|
||||||
{
|
{
|
||||||
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
||||||
if(ray->t == 0.0f) {
|
if(ray->t == 0.0f) {
|
||||||
@@ -126,7 +109,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
|
|||||||
/* Intersect to find an opaque surface, or record all transparent
|
/* Intersect to find an opaque surface, or record all transparent
|
||||||
* surface hits.
|
* surface hits.
|
||||||
*/
|
*/
|
||||||
Intersection hits_stack[STACK_MAX_HITS];
|
Intersection hits_stack[SHADOW_STACK_MAX_HITS];
|
||||||
Intersection *hits = hits_stack;
|
Intersection *hits = hits_stack;
|
||||||
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
||||||
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
||||||
@@ -138,7 +121,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
|
|||||||
*
|
*
|
||||||
* Ignore this on GPU because of slow/unavailable malloc().
|
* Ignore this on GPU because of slow/unavailable malloc().
|
||||||
*/
|
*/
|
||||||
if(max_hits + 1 > STACK_MAX_HITS) {
|
if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
|
||||||
if(kg->transparent_shadow_intersections == NULL) {
|
if(kg->transparent_shadow_intersections == NULL) {
|
||||||
kg->transparent_shadow_intersections =
|
kg->transparent_shadow_intersections =
|
||||||
(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
|
(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
|
||||||
@@ -211,30 +194,27 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
|
|||||||
#endif
|
#endif
|
||||||
return blocked;
|
return blocked;
|
||||||
}
|
}
|
||||||
|
#endif /* __SHADOW_RECORD_ALL__ */
|
||||||
|
|
||||||
#undef STACK_MAX_HITS
|
#ifndef __KERNEL_CPU__
|
||||||
|
/* Shadow function to compute how much light is blocked,
|
||||||
#else
|
|
||||||
|
|
||||||
/* Shadow function to compute how much light is blocked, GPU variation.
|
|
||||||
*
|
*
|
||||||
* Here we raytrace from one transparent surface to the next step by step.
|
* Here we raytrace from one transparent surface to the next step by step.
|
||||||
* To minimize overhead in cases where we don't need transparent shadows, we
|
* To minimize overhead in cases where we don't need transparent shadows, we
|
||||||
* first trace a regular shadow ray. We check if the hit primitive was
|
* first trace a regular shadow ray. We check if the hit primitive was
|
||||||
* potentially transparent, and only in that case start marching. this gives
|
* potentially transparent, and only in that case start marching. this gives
|
||||||
* one extra ray cast for the cases were we do want transparency. */
|
* one extra ray cast for the cases were we do want transparency.
|
||||||
|
*/
|
||||||
ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
|
ccl_device_noinline bool shadow_blocked_stepped(KernelGlobals *kg,
|
||||||
ShaderData *shadow_sd,
|
ShaderData *shadow_sd,
|
||||||
ccl_addr_space PathState *state,
|
ccl_addr_space PathState *state,
|
||||||
ccl_addr_space Ray *ray_input,
|
ccl_addr_space Ray *ray_input,
|
||||||
float3 *shadow)
|
float3 *shadow)
|
||||||
{
|
{
|
||||||
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
||||||
|
if(ray_input->t == 0.0f) {
|
||||||
if(ray_input->t == 0.0f)
|
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
#ifdef __SPLIT_KERNEL__
|
#ifdef __SPLIT_KERNEL__
|
||||||
Ray private_ray = *ray_input;
|
Ray private_ray = *ray_input;
|
||||||
Ray *ray = &private_ray;
|
Ray *ray = &private_ray;
|
||||||
@@ -313,10 +293,32 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
|
|||||||
}
|
}
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return blocked;
|
return blocked;
|
||||||
}
|
}
|
||||||
|
#endif /* __KERNEL_CPU__ */
|
||||||
|
|
||||||
|
ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
|
||||||
|
ShaderData *shadow_sd,
|
||||||
|
PathState *state,
|
||||||
|
Ray *ray,
|
||||||
|
float3 *shadow)
|
||||||
|
{
|
||||||
|
#ifdef __SHADOW_RECORD_ALL__
|
||||||
|
# ifdef __KERNEL_CPU__
|
||||||
|
return shadow_blocked_all(kg, shadow_sd, state, ray, shadow);
|
||||||
|
# else
|
||||||
|
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
||||||
|
const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
||||||
|
if(max_hits + 1 < SHADOW_STACK_MAX_HITS) {
|
||||||
|
return shadow_blocked_all(kg, shadow_sd, state, ray, shadow);
|
||||||
|
}
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef __KERNEL_CPU__
|
||||||
|
return shadow_blocked_stepped(kg, shadow_sd, state, ray, shadow);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef SHADOW_STACK_MAX_HITS
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
Reference in New Issue
Block a user