Code refactor: use split variance calculation for mega kernels too.

There is no significant difference in denoised benchmark scenes and
denoising ctests, so might as well make it all consistent.
This commit is contained in:
Brecht Van Lommel
2017-09-27 01:03:50 +02:00
parent e3e16cecc4
commit 12f4538205
9 changed files with 35 additions and 96 deletions

View File

@@ -176,10 +176,10 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
@@ -563,8 +563,7 @@ public:
(float*) buffer_variance_ptr, (float*) buffer_variance_ptr,
&task->rect.x, &task->rect.x,
task->render_buffer.pass_stride, task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset, task->render_buffer.denoising_data_offset);
use_split_kernel);
} }
} }
return true; return true;
@@ -587,8 +586,7 @@ public:
(float*) variance_ptr, (float*) variance_ptr,
&task->rect.x, &task->rect.x,
task->render_buffer.pass_stride, task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset, task->render_buffer.denoising_data_offset);
use_split_kernel);
} }
} }
return true; return true;

View File

@@ -1173,7 +1173,6 @@ public:
task->rect.z-task->rect.x, task->rect.z-task->rect.x,
task->rect.w-task->rect.y); task->rect.w-task->rect.y);
bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples, void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer, &task->tiles_mem.device_pointer,
&a_ptr, &a_ptr,
@@ -1183,8 +1182,7 @@ public:
&buffer_variance_ptr, &buffer_variance_ptr,
&task->rect, &task->rect,
&task->render_buffer.pass_stride, &task->render_buffer.pass_stride,
&task->render_buffer.denoising_data_offset, &task->render_buffer.denoising_data_offset};
&use_split_variance};
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
cuda_assert(cuCtxSynchronize()); cuda_assert(cuCtxSynchronize());
@@ -1209,7 +1207,6 @@ public:
task->rect.z-task->rect.x, task->rect.z-task->rect.x,
task->rect.w-task->rect.y); task->rect.w-task->rect.y);
bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples, void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer, &task->tiles_mem.device_pointer,
&mean_offset, &mean_offset,
@@ -1218,8 +1215,7 @@ public:
&variance_ptr, &variance_ptr,
&task->rect, &task->rect,
&task->render_buffer.pass_stride, &task->render_buffer.pass_stride,
&task->render_buffer.denoising_data_offset, &task->render_buffer.denoising_data_offset};
&use_split_variance};
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
cuda_assert(cuCtxSynchronize()); cuda_assert(cuCtxSynchronize());

View File

@@ -982,7 +982,6 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterDivideShadow, 0, kernel_set_args(ckFilterDivideShadow, 0,
task->render_buffer.samples, task->render_buffer.samples,
tiles_mem, tiles_mem,
@@ -993,8 +992,7 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
buffer_variance_mem, buffer_variance_mem,
task->rect, task->rect,
task->render_buffer.pass_stride, task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset, task->render_buffer.denoising_data_offset);
split_kernel);
enqueue_kernel(ckFilterDivideShadow, enqueue_kernel(ckFilterDivideShadow,
task->rect.z-task->rect.x, task->rect.z-task->rect.x,
task->rect.w-task->rect.y); task->rect.w-task->rect.y);
@@ -1015,7 +1013,6 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterGetFeature, 0, kernel_set_args(ckFilterGetFeature, 0,
task->render_buffer.samples, task->render_buffer.samples,
tiles_mem, tiles_mem,
@@ -1025,8 +1022,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
variance_mem, variance_mem,
task->rect, task->rect,
task->render_buffer.pass_stride, task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset, task->render_buffer.denoising_data_offset);
split_kernel);
enqueue_kernel(ckFilterGetFeature, enqueue_kernel(ckFilterGetFeature,
task->rect.z-task->rect.x, task->rect.z-task->rect.x,
task->rect.w-task->rect.y); task->rect.w-task->rect.y);

View File

@@ -35,8 +35,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
ccl_global float *bufferVariance, ccl_global float *bufferVariance,
int4 rect, int4 rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
bool use_split_variance)
{ {
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
@@ -57,10 +56,12 @@ ccl_device void kernel_filter_divide_shadow(int sample,
float varB = center_buffer[5]; float varB = center_buffer[5];
int odd_sample = (sample+1)/2; int odd_sample = (sample+1)/2;
int even_sample = sample/2; int even_sample = sample/2;
if(use_split_variance) {
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); * update does not work efficiently with atomics in the kernel. */
} varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
varA /= max(odd_sample - 1, 1); varA /= max(odd_sample - 1, 1);
varB /= max(even_sample - 1, 1); varB /= max(even_sample - 1, 1);
@@ -84,8 +85,7 @@ ccl_device void kernel_filter_get_feature(int sample,
ccl_global float *mean, ccl_global float *mean,
ccl_global float *variance, ccl_global float *variance,
int4 rect, int buffer_pass_stride, int4 rect, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
bool use_split_variance)
{ {
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
@@ -97,12 +97,9 @@ ccl_device void kernel_filter_get_feature(int sample,
mean[idx] = center_buffer[m_offset] / sample; mean[idx] = center_buffer[m_offset] / sample;
if(sample > 1) { if(sample > 1) {
if(use_split_variance) { /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); * update does not work efficiently with atomics in the kernel. */
} variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
else {
variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
}
} }
else { else {
/* Can't compute variance with single sample, just set it very high. */ /* Can't compute variance with single sample, just set it very high. */

View File

@@ -67,18 +67,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
/* The online one-pass variance update that's used for the megakernel can't easily be implemented /* The online one-pass variance update that's used for the megakernel can't easily be implemented
* with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+1, sample, value*value); kernel_write_pass_float(buffer+1, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float(buffer+1, sample, 0.0f);
}
else {
float new_mean = buffer[0] * (1.0f / (sample + 1));
float old_mean = (buffer[0] - value) * (1.0f / sample);
kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
}
# endif
} }
# if defined(__SPLIT_KERNEL__) # if defined(__SPLIT_KERNEL__)
@@ -95,19 +84,7 @@ ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buff
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
{ {
kernel_write_pass_float3_unaligned(buffer, sample, value); kernel_write_pass_float3_unaligned(buffer, sample, value);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
}
else {
float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
float3 new_mean = sum * (1.0f / (sample + 1));
float3 old_mean = (sum - value) * (1.0f / sample);
kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
}
# endif
} }
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
@@ -125,18 +102,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob
kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
float value = path_total_shaded / max(path_total, 1e-7f); float value = path_total_shaded / max(path_total, 1e-7f);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+2, sample/2, value*value); kernel_write_pass_float(buffer+2, sample/2, value*value);
# else
if(sample < 2) {
kernel_write_pass_float(buffer+2, sample/2, 0.0f);
}
else {
float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
float new_value = buffer[1] / max(buffer[0], 1e-7f);
kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
}
# endif
} }
#endif /* __DENOISING_FEATURES__ */ #endif /* __DENOISING_FEATURES__ */

View File

@@ -27,8 +27,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
float *bufferV, float *bufferV,
int* prefilter_rect, int* prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset);
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles, TilesInfo *tiles,
@@ -40,8 +39,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
float *variance, float *variance,
int* prefilter_rect, int* prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset);
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image, ccl_global float *image,

View File

@@ -45,8 +45,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
float *bufferVariance, float *bufferVariance,
int* prefilter_rect, int* prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
bool use_split_variance)
{ {
#ifdef KERNEL_STUB #ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
@@ -60,8 +59,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
bufferVariance, bufferVariance,
load_int4(prefilter_rect), load_int4(prefilter_rect),
buffer_pass_stride, buffer_pass_stride,
buffer_denoising_offset, buffer_denoising_offset);
use_split_variance);
#endif #endif
} }
@@ -74,8 +72,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
float *mean, float *variance, float *mean, float *variance,
int* prefilter_rect, int* prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
bool use_split_variance)
{ {
#ifdef KERNEL_STUB #ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_get_feature); STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
@@ -86,8 +83,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
mean, variance, mean, variance,
load_int4(prefilter_rect), load_int4(prefilter_rect),
buffer_pass_stride, buffer_pass_stride,
buffer_denoising_offset, buffer_denoising_offset);
use_split_variance);
#endif #endif
} }

View File

@@ -37,8 +37,7 @@ kernel_cuda_filter_divide_shadow(int sample,
float *bufferVariance, float *bufferVariance,
int4 prefilter_rect, int4 prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
bool use_split_variance)
{ {
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
@@ -53,8 +52,7 @@ kernel_cuda_filter_divide_shadow(int sample,
bufferVariance, bufferVariance,
prefilter_rect, prefilter_rect,
buffer_pass_stride, buffer_pass_stride,
buffer_denoising_offset, buffer_denoising_offset);
use_split_variance);
} }
} }
@@ -68,8 +66,7 @@ kernel_cuda_filter_get_feature(int sample,
float *variance, float *variance,
int4 prefilter_rect, int4 prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
bool use_split_variance)
{ {
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
@@ -81,8 +78,7 @@ kernel_cuda_filter_get_feature(int sample,
mean, variance, mean, variance,
prefilter_rect, prefilter_rect,
buffer_pass_stride, buffer_pass_stride,
buffer_denoising_offset, buffer_denoising_offset);
use_split_variance);
} }
} }

View File

@@ -31,8 +31,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
ccl_global float *bufferVariance, ccl_global float *bufferVariance,
int4 prefilter_rect, int4 prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
char use_split_variance)
{ {
int x = prefilter_rect.x + get_global_id(0); int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1); int y = prefilter_rect.y + get_global_id(1);
@@ -47,8 +46,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
bufferVariance, bufferVariance,
prefilter_rect, prefilter_rect,
buffer_pass_stride, buffer_pass_stride,
buffer_denoising_offset, buffer_denoising_offset);
use_split_variance);
} }
} }
@@ -60,8 +58,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
ccl_global float *variance, ccl_global float *variance,
int4 prefilter_rect, int4 prefilter_rect,
int buffer_pass_stride, int buffer_pass_stride,
int buffer_denoising_offset, int buffer_denoising_offset)
char use_split_variance)
{ {
int x = prefilter_rect.x + get_global_id(0); int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1); int y = prefilter_rect.y + get_global_id(1);
@@ -73,8 +70,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
mean, variance, mean, variance,
prefilter_rect, prefilter_rect,
buffer_pass_stride, buffer_pass_stride,
buffer_denoising_offset, buffer_denoising_offset);
use_split_variance);
} }
} }