Cycles: Trace indirect subsurface rays by restarting the integrator loop

This gives much lower stack usage on the GPU and reduces kernel memory size to
around 448MB on a GTX 560 Ti (compared to 652MB with the previous commit and
946MB with the official release). There's also a barely measurable speedup of
around 5%, but this still needs to be confirmed.

At this stage we're using only ~3% for the experimental kernel, and SSS
rendering seems to be faster by 40%. After some further testing we might
consider making SSS and CMJ official features and removing the experimental
precompiled kernels.
This commit is contained in:
Sergey Sharybin
2015-11-22 16:08:03 +05:00
parent 2a5c1fc9cc
commit 26f1c51ca6
2 changed files with 71 additions and 42 deletions

View File

@@ -74,7 +74,22 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
&throughput, &throughput,
&ss_indirect)) &ss_indirect))
{ {
kernel_path_subsurface_scatter_indirect(kg, &L_sample, &state, &rng, &ray, &ss_indirect); while(ss_indirect.num_rays) {
kernel_path_subsurface_setup_indirect(kg,
&ss_indirect,
&L_sample,
&state,
&ray,
&ray,
&throughput);
kernel_path_indirect(kg,
&rng,
&ray,
throughput,
state.num_samples,
&state,
&L_sample);
}
is_sss_sample = true; is_sss_sample = true;
} }
} }

View File

@@ -448,6 +448,12 @@ ccl_device bool kernel_path_subsurface_scatter(
/* do bssrdf scatter step if we picked a bssrdf closure */ /* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) { if(sc) {
/* We should never have two consecutive BSSRDF bounces,
* the second one should be converted to a diffuse BSDF to
* avoid this.
*/
kernel_assert(ss_indirect->num_rays == 0);
uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
SubsurfaceIntersection ss_isect; SubsurfaceIntersection ss_isect;
@@ -510,50 +516,44 @@ ccl_device bool kernel_path_subsurface_scatter(
return false; return false;
} }
/* Trace subsurface indirect rays separately after the path loop, to reduce ccl_device void kernel_path_subsurface_setup_indirect(
* GPU stack memory usage. this way ShaderData and other data structures
* used during the loop are not needed during kernel_path_indirect.
*/
ccl_device void kernel_path_subsurface_scatter_indirect(
KernelGlobals *kg, KernelGlobals *kg,
SubsurfaceIndirectRays *ss_indirect,
PathRadiance *L, PathRadiance *L,
PathState *state, PathState *state,
RNG *rng, Ray *orig_ray,
Ray *ray, Ray *ray,
SubsurfaceIndirectRays *ss_indirect) float3 *throughput)
{ {
for (int i = 0; i < ss_indirect->num_rays; i++) { /* Setup state, ray and throughput for indirect SSS rays. */
Ray *indirect_ray = &ss_indirect->rays[i]; ss_indirect->num_rays--;
float3 indirect_throughput = ss_indirect->throughputs[i];
Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
*state = ss_indirect->state; *state = ss_indirect->state;
*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
#ifdef __VOLUME__ #ifdef __VOLUME__
if(ss_indirect->need_update_volume_stack) { if(ss_indirect->need_update_volume_stack) {
/* TODO(sergey): Single assignment per scatter. */ Ray volume_ray = *orig_ray;
Ray volume_ray = *ray;
/* Setup ray from previous surface point to the new one. */ /* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P, volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
&volume_ray.t); &volume_ray.t);
kernel_volume_stack_update_for_subsurface( kernel_volume_stack_update_for_subsurface(kg,
kg,
&volume_ray, &volume_ray,
state->volume_stack); state->volume_stack);
} }
#endif #endif
/* Note that this modifies state. */ *ray = *indirect_ray;
kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
/* For render passes, sum and reset indirect light pass variables /* For render passes, sum and reset indirect light pass variables
* for the next samples. * for the next samples.
*/ */
path_radiance_sum_indirect(L); path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L); path_radiance_reset_indirect(L);
}
} }
#endif #endif
@@ -577,6 +577,14 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
SubsurfaceIndirectRays ss_indirect; SubsurfaceIndirectRays ss_indirect;
ss_indirect.num_rays = 0; ss_indirect.num_rays = 0;
/* TODO(sergey): Avoid having explicit copy of the pre-subsurface scatter
* ray by storing an updated version of state in the ss_indirect which will
* be updated to the new volume stack.
*/
Ray ss_orig_ray;
for(;;) {
#endif #endif
/* path iteration */ /* path iteration */
@@ -825,6 +833,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
&throughput, &throughput,
&ss_indirect)) &ss_indirect))
{ {
ss_orig_ray = ray;
break; break;
} }
} }
@@ -839,16 +848,21 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
} }
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
/* Trace indirect subsurface afterwards to reduce GPU stack size. /* Trace indirect subsurface rays by restarting the loop. this uses less
* note that this modifies state. * stack memory than invoking kernel_path_indirect.
*/ */
if (ss_indirect.num_rays) { if(ss_indirect.num_rays) {
kernel_path_subsurface_scatter_indirect(kg, kernel_path_subsurface_setup_indirect(kg,
&ss_indirect,
&L, &L,
&state, &state,
rng, &ss_orig_ray,
&ray, &ray,
&ss_indirect); &throughput);
}
else {
break;
}
} }
#endif #endif