Cycles: Store shadow intersections in the kernel globals
Seems CUDA failed to de-duplicate the array across multiple inlined versions of the shadow_blocked(). Helped it a bit with that now. Gives about 100MB memory improvement on a scenes after previous commit and brings up memory "regression" to only 100MB comparing to the master branch now.
This commit is contained in:
@@ -76,7 +76,10 @@ typedef struct KernelGlobals {
|
|||||||
#ifdef __KERNEL_CUDA__
|
#ifdef __KERNEL_CUDA__
|
||||||
|
|
||||||
__constant__ KernelData __data;
|
__constant__ KernelData __data;
|
||||||
typedef struct KernelGlobals {} KernelGlobals;
|
typedef struct KernelGlobals {
|
||||||
|
/* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
|
||||||
|
Intersection hits_stack[64];
|
||||||
|
} KernelGlobals;
|
||||||
|
|
||||||
# ifdef __KERNEL_CUDA_TEX_STORAGE__
|
# ifdef __KERNEL_CUDA_TEX_STORAGE__
|
||||||
# define KERNEL_TEX(type, ttype, name) ttype name;
|
# define KERNEL_TEX(type, ttype, name) ttype name;
|
||||||
|
@@ -109,8 +109,12 @@ ccl_device_inline bool shadow_blocked_all(KernelGlobals *kg,
|
|||||||
/* Intersect to find an opaque surface, or record all transparent
|
/* Intersect to find an opaque surface, or record all transparent
|
||||||
* surface hits.
|
* surface hits.
|
||||||
*/
|
*/
|
||||||
|
#ifdef __KERNEL_CUDA__
|
||||||
|
Intersection *hits = kg->hits_stack;
|
||||||
|
#else
|
||||||
Intersection hits_stack[SHADOW_STACK_MAX_HITS];
|
Intersection hits_stack[SHADOW_STACK_MAX_HITS];
|
||||||
Intersection *hits = hits_stack;
|
Intersection *hits = hits_stack;
|
||||||
|
#endif
|
||||||
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
||||||
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
||||||
#ifndef __KERNEL_GPU__
|
#ifndef __KERNEL_GPU__
|
||||||
@@ -247,6 +251,7 @@ ccl_device_noinline bool shadow_blocked_stepped(KernelGlobals *kg,
|
|||||||
for(;;) {
|
for(;;) {
|
||||||
if(bounce >= kernel_data.integrator.transparent_max_bounce) {
|
if(bounce >= kernel_data.integrator.transparent_max_bounce) {
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
if(!scene_intersect(kg,
|
if(!scene_intersect(kg,
|
||||||
*ray,
|
*ray,
|
||||||
PATH_RAY_SHADOW_TRANSPARENT,
|
PATH_RAY_SHADOW_TRANSPARENT,
|
||||||
|
@@ -130,8 +130,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int s
|
|||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
|
||||||
if(x < sx + sw && y < sy + sh)
|
if(x < sx + sw && y < sy + sh) {
|
||||||
kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
|
KernelGlobals kg;
|
||||||
|
kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __BRANCHED_PATH__
|
#ifdef __BRANCHED_PATH__
|
||||||
@@ -142,8 +144,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int
|
|||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
|
||||||
if(x < sx + sw && y < sy + sh)
|
if(x < sx + sw && y < sy + sh) {
|
||||||
kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
|
KernelGlobals kg;
|
||||||
|
kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -154,8 +158,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int
|
|||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
|
||||||
if(x < sx + sw && y < sy + sh)
|
if(x < sx + sw && y < sy + sh) {
|
||||||
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
|
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" __global__ void
|
extern "C" __global__ void
|
||||||
@@ -165,8 +170,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
|
|||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
|
||||||
if(x < sx + sw && y < sy + sh)
|
if(x < sx + sw && y < sy + sh) {
|
||||||
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
|
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" __global__ void
|
extern "C" __global__ void
|
||||||
@@ -183,7 +189,8 @@ kernel_cuda_shader(uint4 *input,
|
|||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if(x < sx + sw) {
|
if(x < sx + sw) {
|
||||||
kernel_shader_evaluate(NULL,
|
KernelGlobals kg;
|
||||||
|
kernel_shader_evaluate(&kg,
|
||||||
input,
|
input,
|
||||||
output,
|
output,
|
||||||
output_luma,
|
output_luma,
|
||||||
@@ -200,8 +207,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
|
|||||||
{
|
{
|
||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if(x < sx + sw)
|
if(x < sx + sw) {
|
||||||
kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
|
KernelGlobals kg;
|
||||||
|
kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user