Cycles CUDA: reduce stack memory by reusing ShaderData.

Stack memory usage is 57% lower for path and 48% lower for branched path.
This commit is contained in:
Brecht Van Lommel
2016-05-22 22:35:47 +02:00
parent af4a04eae0
commit 999d5a6785
14 changed files with 196 additions and 169 deletions

View File

@@ -53,6 +53,7 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_path_indirect(KernelGlobals *kg,
ShaderData *emission_sd,
RNG *rng,
Ray *ray,
float3 throughput,
@@ -60,6 +61,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
PathState *state,
PathRadiance *L)
{
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
/* path iteration */
for(;;) {
/* intersect scene */
@@ -87,7 +91,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
/* intersect with lamp */
float3 emission;
if(indirect_lamp_emission(kg, state, &light_ray, &emission)) {
if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) {
path_radiance_accum_emission(L,
throughput,
emission,
@@ -115,15 +119,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
if(decoupled) {
/* cache steps along volume for repeated sampling */
VolumeSegment volume_segment;
ShaderData volume_sd;
shader_setup_from_volume(kg,
&volume_sd,
&sd,
&volume_ray);
kernel_volume_decoupled_record(kg,
state,
&volume_ray,
&volume_sd,
&sd,
&volume_segment,
heterogeneous);
@@ -146,7 +149,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg,
rng,
&volume_sd,
&sd,
emission_sd,
throughput,
state,
L,
@@ -163,7 +167,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
result = kernel_volume_decoupled_scatter(kg,
state,
&volume_ray,
&volume_sd,
&sd,
&throughput,
rphase,
rscatter,
@@ -178,7 +182,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg,
rng,
&volume_sd,
&sd,
&throughput,
state,
L,
@@ -198,16 +202,16 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
# endif
{
/* integrate along volume segment with distance sampling */
ShaderData volume_sd;
VolumeIntegrateResult result = kernel_volume_integrate(
kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
kg, state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg,
rng,
&volume_sd,
&sd,
emission_sd,
throughput,
state,
L);
@@ -215,7 +219,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
/* indirect light bounce */
if(kernel_path_volume_bounce(kg,
rng,
&volume_sd,
&sd,
&throughput,
state,
L,
@@ -235,7 +239,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
if(!hit) {
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, state, ray);
float3 L_background = indirect_background(kg, emission_sd, state, ray);
path_radiance_accum_background(L,
throughput,
L_background,
@@ -246,7 +250,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
/* setup shading */
ShaderData sd;
shader_setup_from_ray(kg,
&sd,
&isect,
@@ -328,7 +331,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
light_ray.dP = sd.dP;
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) {
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L,
throughput,
ao_alpha,
@@ -378,6 +381,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
kernel_branched_path_surface_connect_light(kg,
rng,
&sd,
emission_sd,
state,
throughput,
1.0f,
@@ -393,6 +397,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
PathState *state,
RNG *rng,
@@ -425,7 +430,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dP = ccl_fetch(sd, dP);
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
}
}
@@ -435,6 +440,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ccl_device bool kernel_path_subsurface_scatter(
KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
PathState *state,
RNG *rng,
@@ -503,7 +509,7 @@ ccl_device bool kernel_path_subsurface_scatter(
hit_L->direct_throughput = L->direct_throughput;
path_radiance_copy_indirect(hit_L, L);
kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, hit_L);
kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
if(kernel_path_surface_bounce(kg,
rng,
@@ -526,6 +532,7 @@ ccl_device bool kernel_path_subsurface_scatter(
kernel_volume_stack_update_for_subsurface(
kg,
emission_sd,
&volume_ray,
hit_state->volume_stack);
}
@@ -604,8 +611,13 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
path_radiance_init(&L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
/* shader data used by emission, shadows, volume stacks */
ShaderData emission_sd;
PathState state;
path_state_init(kg, &state, rng, sample, &ray);
path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
#ifdef __KERNEL_DEBUG__
DebugData debug_data;
@@ -669,7 +681,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* intersect with lamp */
float3 emission;
if(indirect_lamp_emission(kg, &state, &light_ray, &emission))
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
}
#endif
@@ -689,11 +701,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(decoupled) {
/* cache steps along volume for repeated sampling */
VolumeSegment volume_segment;
ShaderData volume_sd;
shader_setup_from_volume(kg, &volume_sd, &volume_ray);
shader_setup_from_volume(kg, &sd, &volume_ray);
kernel_volume_decoupled_record(kg, &state,
&volume_ray, &volume_sd, &volume_segment, heterogeneous);
&volume_ray, &sd, &volume_segment, heterogeneous);
volume_segment.sampling_method = sampling_method;
@@ -708,8 +719,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
int all = false;
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
throughput, &state, &L, all, &volume_ray, &volume_segment);
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
* one sample for direct and indirect light, we could share
@@ -718,7 +730,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
result = kernel_volume_decoupled_scatter(kg,
&state, &volume_ray, &volume_sd, &throughput,
&state, &volume_ray, &sd, &throughput,
rphase, rscatter, &volume_segment, NULL, true);
}
@@ -726,7 +738,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
continue;
else
break;
@@ -739,17 +751,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
# endif
{
/* integrate along volume segment with distance sampling */
ShaderData volume_sd;
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
/* indirect light bounce */
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
continue;
else
break;
@@ -772,7 +783,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &state, &ray);
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, throughput, L_background, state.bounce);
#endif
@@ -780,7 +791,6 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
/* setup shading */
ShaderData sd;
shader_setup_from_ray(kg, &sd, &isect, &ray);
float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
shader_eval_surface(kg, &sd, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
@@ -848,7 +858,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_path_ao(kg, &sd, &L, &state, rng, throughput);
kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
}
#endif
@@ -858,6 +868,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_BSSRDF) {
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
&L,
&state,
rng,
@@ -871,7 +882,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L);
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
/* compute direct lighting and next bounce */
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))