Cycles: Trace indirect subsurface rays by restarting the integrator loop

This gives much lower stack usage on the GPU and reduces kernel memory size to
around 448MB on a GTX 560 Ti (compared to 652MB with the previous commit and
946MB with the official release). There's also a barely measurable speedup of
around 5%, but this still needs to be confirmed.

At this stage we're using only ~3% for the experimental kernel, and SSS
rendering seems to be faster by 40%. After some further testing we might
consider making SSS and CMJ official features and removing the experimental
precompiled kernels.
This commit is contained in:
Sergey Sharybin
2015-11-22 16:08:03 +05:00
parent 2a5c1fc9cc
commit 26f1c51ca6
2 changed files with 71 additions and 42 deletions

View File

@@ -74,7 +74,22 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
&throughput, &throughput,
&ss_indirect)) &ss_indirect))
{ {
kernel_path_subsurface_scatter_indirect(kg, &L_sample, &state, &rng, &ray, &ss_indirect); while(ss_indirect.num_rays) {
kernel_path_subsurface_setup_indirect(kg,
&ss_indirect,
&L_sample,
&state,
&ray,
&ray,
&throughput);
kernel_path_indirect(kg,
&rng,
&ray,
throughput,
state.num_samples,
&state,
&L_sample);
}
is_sss_sample = true; is_sss_sample = true;
} }
} }

View File

@@ -448,6 +448,12 @@ ccl_device bool kernel_path_subsurface_scatter(
/* do bssrdf scatter step if we picked a bssrdf closure */ /* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) { if(sc) {
/* We should never have two consecutive BSSRDF bounces,
* the second one should be converted to a diffuse BSDF to
* avoid this.
*/
kernel_assert(ss_indirect->num_rays == 0);
uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
SubsurfaceIntersection ss_isect; SubsurfaceIntersection ss_isect;
@@ -510,50 +516,44 @@ ccl_device bool kernel_path_subsurface_scatter(
return false; return false;
} }
/* Trace subsurface indirect rays separately after the path loop, to reduce ccl_device void kernel_path_subsurface_setup_indirect(
* GPU stack memory usage. this way ShaderData and other data structures
* used during the loop are not needed during kernel_path_indirect.
*/
ccl_device void kernel_path_subsurface_scatter_indirect(
KernelGlobals *kg, KernelGlobals *kg,
SubsurfaceIndirectRays *ss_indirect,
PathRadiance *L, PathRadiance *L,
PathState *state, PathState *state,
RNG *rng, Ray *orig_ray,
Ray *ray, Ray *ray,
SubsurfaceIndirectRays *ss_indirect) float3 *throughput)
{ {
for (int i = 0; i < ss_indirect->num_rays; i++) { /* Setup state, ray and throughput for indirect SSS rays. */
Ray *indirect_ray = &ss_indirect->rays[i]; ss_indirect->num_rays--;
float3 indirect_throughput = ss_indirect->throughputs[i];
Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
*state = ss_indirect->state; *state = ss_indirect->state;
*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
#ifdef __VOLUME__ #ifdef __VOLUME__
if(ss_indirect->need_update_volume_stack) { if(ss_indirect->need_update_volume_stack) {
/* TODO(sergey): Single assignment per scatter. */ Ray volume_ray = *orig_ray;
Ray volume_ray = *ray;
/* Setup ray from previous surface point to the new one. */ /* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P, volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
&volume_ray.t); &volume_ray.t);
kernel_volume_stack_update_for_subsurface( kernel_volume_stack_update_for_subsurface(kg,
kg,
&volume_ray, &volume_ray,
state->volume_stack); state->volume_stack);
} }
#endif #endif
/* Note that this modifies state. */ *ray = *indirect_ray;
kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
/* For render passes, sum and reset indirect light pass variables /* For render passes, sum and reset indirect light pass variables
* for the next samples. * for the next samples.
*/ */
path_radiance_sum_indirect(L); path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L); path_radiance_reset_indirect(L);
}
} }
#endif #endif
@@ -577,6 +577,14 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
SubsurfaceIndirectRays ss_indirect; SubsurfaceIndirectRays ss_indirect;
ss_indirect.num_rays = 0; ss_indirect.num_rays = 0;
/* TODO(sergey): Avoid having explicit copy of the pre-subsurface scatter
* ray by storing an updated version of state in the ss_indirect which will
* be updated to the new volume stack.
*/
Ray ss_orig_ray;
for(;;) {
#endif #endif
/* path iteration */ /* path iteration */
@@ -825,6 +833,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
&throughput, &throughput,
&ss_indirect)) &ss_indirect))
{ {
ss_orig_ray = ray;
break; break;
} }
} }
@@ -839,16 +848,21 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
} }
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
/* Trace indirect subsurface afterwards to reduce GPU stack size. /* Trace indirect subsurface rays by restarting the loop. this uses less
* note that this modifies state. * stack memory than invoking kernel_path_indirect.
*/ */
if (ss_indirect.num_rays) { if(ss_indirect.num_rays) {
kernel_path_subsurface_scatter_indirect(kg, kernel_path_subsurface_setup_indirect(kg,
&ss_indirect,
&L, &L,
&state, &state,
rng, &ss_orig_ray,
&ray, &ray,
&ss_indirect); &throughput);
}
else {
break;
}
} }
#endif #endif