Cycles: Optionally output luminance from the shader evaluation kernel
This makes it possible to move some parts of evaluation from host to the device and hopefully reduce memory usage by avoid having full RGBA buffer on the host. Reviewers: juicyfruit, lukasstockner97, brecht Reviewed By: lukasstockner97, brecht Differential Revision: https://developer.blender.org/D1702
This commit is contained in:
@@ -343,7 +343,7 @@ public:
|
|||||||
#ifdef WITH_OSL
|
#ifdef WITH_OSL
|
||||||
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
|
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
|
||||||
#endif
|
#endif
|
||||||
void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
|
void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int);
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||||
if(system_cpu_support_avx2())
|
if(system_cpu_support_avx2())
|
||||||
@@ -374,8 +374,14 @@ public:
|
|||||||
|
|
||||||
for(int sample = 0; sample < task.num_samples; sample++) {
|
for(int sample = 0; sample < task.num_samples; sample++) {
|
||||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
|
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
|
||||||
shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
|
shader_kernel(&kg,
|
||||||
task.shader_eval_type, x, task.offset, sample);
|
(uint4*)task.shader_input,
|
||||||
|
(float4*)task.shader_output,
|
||||||
|
(float*)task.shader_output_luma,
|
||||||
|
task.shader_eval_type,
|
||||||
|
x,
|
||||||
|
task.offset,
|
||||||
|
sample);
|
||||||
|
|
||||||
if(task.get_cancel() || task_pool.canceled())
|
if(task.get_cancel() || task_pool.canceled())
|
||||||
break;
|
break;
|
||||||
|
@@ -726,6 +726,7 @@ public:
|
|||||||
CUfunction cuShader;
|
CUfunction cuShader;
|
||||||
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
|
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
|
||||||
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
|
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
|
||||||
|
CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
|
||||||
|
|
||||||
/* get kernel function */
|
/* get kernel function */
|
||||||
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
|
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
|
||||||
@@ -747,13 +748,18 @@ public:
|
|||||||
int shader_w = min(shader_chunk_size, end - shader_x);
|
int shader_w = min(shader_chunk_size, end - shader_x);
|
||||||
|
|
||||||
/* pass in parameters */
|
/* pass in parameters */
|
||||||
void *args[] = {&d_input,
|
void *args[8];
|
||||||
&d_output,
|
int arg = 0;
|
||||||
&task.shader_eval_type,
|
args[arg++] = &d_input;
|
||||||
&shader_x,
|
args[arg++] = &d_output;
|
||||||
&shader_w,
|
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
|
||||||
&offset,
|
args[arg++] = &d_output_luma;
|
||||||
&sample};
|
}
|
||||||
|
args[arg++] = &task.shader_eval_type;
|
||||||
|
args[arg++] = &shader_x;
|
||||||
|
args[arg++] = &shader_w;
|
||||||
|
args[arg++] = &offset;
|
||||||
|
args[arg++] = &sample;
|
||||||
|
|
||||||
/* launch kernel */
|
/* launch kernel */
|
||||||
int threads_per_block;
|
int threads_per_block;
|
||||||
|
@@ -316,6 +316,7 @@ public:
|
|||||||
if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
|
if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
|
||||||
if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
|
if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
|
||||||
if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
|
if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
|
||||||
|
if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
|
||||||
|
|
||||||
sub.device->task_add(subtask);
|
sub.device->task_add(subtask);
|
||||||
}
|
}
|
||||||
|
@@ -648,6 +648,9 @@ protected:
|
|||||||
if(task.shader_output)
|
if(task.shader_output)
|
||||||
task.shader_output = device_ptr_from_client_pointer(task.shader_output);
|
task.shader_output = device_ptr_from_client_pointer(task.shader_output);
|
||||||
|
|
||||||
|
if(task.shader_output)luma)
|
||||||
|
task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
|
||||||
|
|
||||||
|
|
||||||
task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
|
task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
|
||||||
task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
|
task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
|
||||||
|
@@ -132,7 +132,7 @@ public:
|
|||||||
archive & type & task.x & task.y & task.w & task.h;
|
archive & type & task.x & task.y & task.w & task.h;
|
||||||
archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
|
archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
|
||||||
archive & task.offset & task.stride;
|
archive & task.offset & task.stride;
|
||||||
archive & task.shader_input & task.shader_output & task.shader_eval_type;
|
archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
|
||||||
archive & task.shader_x & task.shader_w;
|
archive & task.shader_x & task.shader_w;
|
||||||
archive & task.need_finish_queue;
|
archive & task.need_finish_queue;
|
||||||
}
|
}
|
||||||
@@ -291,7 +291,7 @@ public:
|
|||||||
*archive & type & task.x & task.y & task.w & task.h;
|
*archive & type & task.x & task.y & task.w & task.h;
|
||||||
*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
|
*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
|
||||||
*archive & task.offset & task.stride;
|
*archive & task.offset & task.stride;
|
||||||
*archive & task.shader_input & task.shader_output & task.shader_eval_type;
|
*archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
|
||||||
*archive & task.shader_x & task.shader_w;
|
*archive & task.shader_x & task.shader_w;
|
||||||
*archive & task.need_finish_queue;
|
*archive & task.need_finish_queue;
|
||||||
|
|
||||||
|
@@ -1304,6 +1304,7 @@ public:
|
|||||||
cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
|
cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
|
||||||
cl_mem d_input = CL_MEM_PTR(task.shader_input);
|
cl_mem d_input = CL_MEM_PTR(task.shader_input);
|
||||||
cl_mem d_output = CL_MEM_PTR(task.shader_output);
|
cl_mem d_output = CL_MEM_PTR(task.shader_output);
|
||||||
|
cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
|
||||||
cl_int d_shader_eval_type = task.shader_eval_type;
|
cl_int d_shader_eval_type = task.shader_eval_type;
|
||||||
cl_int d_shader_x = task.shader_x;
|
cl_int d_shader_x = task.shader_x;
|
||||||
cl_int d_shader_w = task.shader_w;
|
cl_int d_shader_w = task.shader_w;
|
||||||
@@ -1330,6 +1331,12 @@ public:
|
|||||||
d_input,
|
d_input,
|
||||||
d_output);
|
d_output);
|
||||||
|
|
||||||
|
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
|
||||||
|
start_arg_index += kernel_set_args(kernel,
|
||||||
|
start_arg_index,
|
||||||
|
d_output_luma);
|
||||||
|
}
|
||||||
|
|
||||||
#define KERNEL_TEX(type, ttype, name) \
|
#define KERNEL_TEX(type, ttype, name) \
|
||||||
set_kernel_arg_mem(kernel, &start_arg_index, #name);
|
set_kernel_arg_mem(kernel, &start_arg_index, #name);
|
||||||
#include "kernel_textures.h"
|
#include "kernel_textures.h"
|
||||||
|
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
|
|||||||
DeviceTask::DeviceTask(Type type_)
|
DeviceTask::DeviceTask(Type type_)
|
||||||
: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
|
: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
|
||||||
sample(0), num_samples(1),
|
sample(0), num_samples(1),
|
||||||
shader_input(0), shader_output(0),
|
shader_input(0), shader_output(0), shader_output_luma(0),
|
||||||
shader_eval_type(0), shader_x(0), shader_w(0)
|
shader_eval_type(0), shader_x(0), shader_w(0)
|
||||||
{
|
{
|
||||||
last_update_time = time_dt();
|
last_update_time = time_dt();
|
||||||
|
@@ -46,7 +46,7 @@ public:
|
|||||||
int offset, stride;
|
int offset, stride;
|
||||||
|
|
||||||
device_ptr shader_input;
|
device_ptr shader_input;
|
||||||
device_ptr shader_output;
|
device_ptr shader_output, shader_output_luma;
|
||||||
int shader_eval_type;
|
int shader_eval_type;
|
||||||
int shader_x, shader_w;
|
int shader_x, shader_w;
|
||||||
|
|
||||||
|
@@ -453,7 +453,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
|
|||||||
output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
|
output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
|
ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
|
||||||
|
ccl_global uint4 *input,
|
||||||
|
ccl_global float4 *output,
|
||||||
|
ccl_global float *output_luma,
|
||||||
|
ShaderEvalType type,
|
||||||
|
int i,
|
||||||
|
int sample)
|
||||||
{
|
{
|
||||||
ShaderData sd;
|
ShaderData sd;
|
||||||
uint4 in = input[i];
|
uint4 in = input[i];
|
||||||
@@ -500,10 +506,22 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* write output */
|
/* write output */
|
||||||
if(sample == 0)
|
if(sample == 0) {
|
||||||
output[i] = make_float4(out.x, out.y, out.z, 0.0f);
|
if(output != NULL) {
|
||||||
else
|
output[i] = make_float4(out.x, out.y, out.z, 0.0f);
|
||||||
output[i] += make_float4(out.x, out.y, out.z, 0.0f);
|
}
|
||||||
|
if(output_luma != NULL) {
|
||||||
|
output_luma[i] = average(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if(output != NULL) {
|
||||||
|
output[i] += make_float4(out.x, out.y, out.z, 0.0f);
|
||||||
|
}
|
||||||
|
if(output_luma != NULL) {
|
||||||
|
output_luma[i] += average(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
@@ -42,6 +42,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
|||||||
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||||
uint4 *input,
|
uint4 *input,
|
||||||
float4 *output,
|
float4 *output,
|
||||||
|
float *output_luma,
|
||||||
int type,
|
int type,
|
||||||
int i,
|
int i,
|
||||||
int offset,
|
int offset,
|
||||||
|
@@ -99,12 +99,14 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
|||||||
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||||
uint4 *input,
|
uint4 *input,
|
||||||
float4 *output,
|
float4 *output,
|
||||||
|
float *output_luma,
|
||||||
int type,
|
int type,
|
||||||
int i,
|
int i,
|
||||||
int offset,
|
int offset,
|
||||||
int sample)
|
int sample)
|
||||||
{
|
{
|
||||||
if(type >= SHADER_EVAL_BAKE) {
|
if(type >= SHADER_EVAL_BAKE) {
|
||||||
|
kernel_assert(output_luma == NULL);
|
||||||
kernel_bake_evaluate(kg,
|
kernel_bake_evaluate(kg,
|
||||||
input,
|
input,
|
||||||
output,
|
output,
|
||||||
@@ -117,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
|||||||
kernel_shader_evaluate(kg,
|
kernel_shader_evaluate(kg,
|
||||||
input,
|
input,
|
||||||
output,
|
output,
|
||||||
|
output_luma,
|
||||||
(ShaderEvalType)type,
|
(ShaderEvalType)type,
|
||||||
i,
|
i,
|
||||||
sample);
|
sample);
|
||||||
|
@@ -159,12 +159,26 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
|
|||||||
|
|
||||||
extern "C" __global__ void
|
extern "C" __global__ void
|
||||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||||
kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
|
kernel_cuda_shader(uint4 *input,
|
||||||
|
float4 *output,
|
||||||
|
float *output_luma,
|
||||||
|
int type,
|
||||||
|
int sx,
|
||||||
|
int sw,
|
||||||
|
int offset,
|
||||||
|
int sample)
|
||||||
{
|
{
|
||||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if(x < sx + sw)
|
if(x < sx + sw) {
|
||||||
kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
|
kernel_shader_evaluate(NULL,
|
||||||
|
input,
|
||||||
|
output,
|
||||||
|
output_luma,
|
||||||
|
(ShaderEvalType)type,
|
||||||
|
x,
|
||||||
|
sample);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" __global__ void
|
extern "C" __global__ void
|
||||||
|
@@ -61,6 +61,7 @@ __kernel void kernel_ocl_shader(
|
|||||||
ccl_constant KernelData *data,
|
ccl_constant KernelData *data,
|
||||||
ccl_global uint4 *input,
|
ccl_global uint4 *input,
|
||||||
ccl_global float4 *output,
|
ccl_global float4 *output,
|
||||||
|
ccl_global float *output_luma,
|
||||||
|
|
||||||
#define KERNEL_TEX(type, ttype, name) \
|
#define KERNEL_TEX(type, ttype, name) \
|
||||||
ccl_global type *name,
|
ccl_global type *name,
|
||||||
@@ -78,8 +79,15 @@ __kernel void kernel_ocl_shader(
|
|||||||
|
|
||||||
int x = sx + get_global_id(0);
|
int x = sx + get_global_id(0);
|
||||||
|
|
||||||
if(x < sx + sw)
|
if(x < sx + sw) {
|
||||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
|
kernel_shader_evaluate(kg,
|
||||||
|
input,
|
||||||
|
output,
|
||||||
|
output_luma,
|
||||||
|
(ShaderEvalType)type,
|
||||||
|
x,
|
||||||
|
sample);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void kernel_ocl_bake(
|
__kernel void kernel_ocl_bake(
|
||||||
|
Reference in New Issue
Block a user