blender/intern/cycles/device/opencl/opencl_split.cpp

/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_OPENCL
#include "opencl.h"
#include "buffers.h"
#include "kernel_types.h"
#include "util_md5.h"
#include "util_path.h"
#include "util_time.h"
CCL_NAMESPACE_BEGIN
/* TODO(sergey): This is to keep tile splitting at the OpenCL level working
* for now, since without it viewport rendering does not work as it
* should.
*
* Ideally it'll be done on a higher level, but we need to get ready
* for the merge rather soon, so let's keep the split logic private here in
* this file.
*/
class SplitRenderTile : public RenderTile {
public:
SplitRenderTile()
: RenderTile(),
buffer_offset_x(0),
buffer_offset_y(0),
rng_state_offset_x(0),
rng_state_offset_y(0),
buffer_rng_state_stride(0) {}
explicit SplitRenderTile(RenderTile& tile)
: RenderTile(),
buffer_offset_x(0),
buffer_offset_y(0),
rng_state_offset_x(0),
rng_state_offset_y(0),
buffer_rng_state_stride(0)
{
x = tile.x;
y = tile.y;
w = tile.w;
h = tile.h;
start_sample = tile.start_sample;
num_samples = tile.num_samples;
sample = tile.sample;
resolution = tile.resolution;
offset = tile.offset;
stride = tile.stride;
buffer = tile.buffer;
rng_state = tile.rng_state;
buffers = tile.buffers;
}
/* The split kernel is constrained by device global memory; hence it
* cannot render big tile sizes in one go. If the user sets a big tile
* size ("big" is relative to the available device global memory),
* we split the tile further and then call path_trace on
* each of those split tiles. The following variables
* assist in achieving that purpose.
*/
int buffer_offset_x;
int buffer_offset_y;
int rng_state_offset_x;
int rng_state_offset_y;
int buffer_rng_state_stride;
};
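/* Editorial sketch (not part of the render logic; the numbers are
* hypothetical): if the user sets a 512x512 tile but the device only has
* global memory for a 256x256 tile, the tile is split into four 256x256
* SplitRenderTile's. Each sub-tile keeps buffer_offset_x/y and
* rng_state_offset_x/y pointing into the parent tile's buffer and rng_state,
* and buffer_rng_state_stride stays the parent stride, so the sub-tiles
* write into the correct region of the shared buffers.
*/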
/* OpenCLDeviceSplitKernel's declaration/definition. */
class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
{
public:
/* Kernel declaration. */
OpenCLProgram program_data_init;
OpenCLProgram program_scene_intersect;
OpenCLProgram program_lamp_emission;
OpenCLProgram program_queue_enqueue;
OpenCLProgram program_background_buffer_update;
OpenCLProgram program_shader_eval;
OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
OpenCLProgram program_direct_lighting;
OpenCLProgram program_shadow_blocked;
OpenCLProgram program_next_iteration_setup;
OpenCLProgram program_sum_all_radiance;
/* Global memory variables [porting]; this memory is used for
* co-operation between different kernels; data written by one
* kernel is made available to other kernels via this global
* memory.
*/
cl_mem rng_coop;
cl_mem throughput_coop;
cl_mem L_transparent_coop;
cl_mem PathRadiance_coop;
cl_mem Ray_coop;
cl_mem PathState_coop;
cl_mem Intersection_coop;
cl_mem kgbuffer; /* KernelGlobals buffer. */
/* Global buffers for ShaderData. */
cl_mem sd; /* ShaderData used in the main path-iteration loop. */
cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and
* shadow_blocked kernel.
*/
/* Global memory required for shadow blocked and accum_radiance. */
cl_mem BSDFEval_coop;
cl_mem ISLamp_coop;
cl_mem LightRay_coop;
cl_mem AOAlpha_coop;
cl_mem AOBSDF_coop;
cl_mem AOLightRay_coop;
cl_mem Intersection_coop_shadow;
#ifdef WITH_CYCLES_DEBUG
/* DebugData memory */
cl_mem debugdata_coop;
#endif
/* Global state array that tracks ray state. */
cl_mem ray_state;
/* Per sample buffers. */
cl_mem per_sample_output_buffers;
/* Denotes which sample each ray is being processed for. */
cl_mem work_array;
/* Queue */
cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */
cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
* Tracks the size of each queue.
*/
/* Flag to make the scene_intersect and lamp_emission kernels use queues. */
cl_mem use_queues_flag;
/* Amount of memory in output buffer associated with one pixel/thread. */
size_t per_thread_output_buffer_size;
/* Total allocatable available device memory. */
size_t total_allocatable_memory;
/* Host version of ray_state; used to check for path-iteration
* termination on the host.
*/
char *hostRayStateArray;
/* Number of path-iterations to be done in one shot. */
unsigned int PathIteration_times;
#ifdef __WORK_STEALING__
/* Work pool with respect to each work group. */
cl_mem work_pool_wgs;
/* Denotes the maximum work groups possible w.r.t. current tile size. */
unsigned int max_work_groups;
#endif
/* max_closure value for which the kernels are currently loaded. */
int current_max_closure;
/* Marked true in the constructor and marked false at the end of path_trace(). */
bool first_tile;
OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
: OpenCLDeviceBase(info, stats, background_)
{
background = background_;
/* Initialize cl_mem variables. */
kgbuffer = NULL;
sd = NULL;
sd_DL_shadow = NULL;
rng_coop = NULL;
throughput_coop = NULL;
L_transparent_coop = NULL;
PathRadiance_coop = NULL;
Ray_coop = NULL;
PathState_coop = NULL;
Intersection_coop = NULL;
ray_state = NULL;
AOAlpha_coop = NULL;
AOBSDF_coop = NULL;
AOLightRay_coop = NULL;
BSDFEval_coop = NULL;
ISLamp_coop = NULL;
LightRay_coop = NULL;
Intersection_coop_shadow = NULL;
#ifdef WITH_CYCLES_DEBUG
debugdata_coop = NULL;
#endif
work_array = NULL;
/* Queue. */
Queue_data = NULL;
Queue_index = NULL;
use_queues_flag = NULL;
per_sample_output_buffers = NULL;
per_thread_output_buffer_size = 0;
hostRayStateArray = NULL;
PathIteration_times = PATH_ITER_INC_FACTOR;
#ifdef __WORK_STEALING__
work_pool_wgs = NULL;
max_work_groups = 0;
#endif
current_max_closure = -1;
first_tile = true;
/* Get device's maximum memory that can be allocated. */
ciErr = clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(size_t),
&total_allocatable_memory,
NULL);
assert(ciErr == CL_SUCCESS);
if(platform_name == "AMD Accelerated Parallel Processing") {
/* This value is tweakable; the AMD platform does not seem to
* give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
* is used for further computation.
*/
total_allocatable_memory /= 2;
}
}
/* Split kernel utility functions. */
size_t get_tex_size(const char *tex_name)
{
cl_mem ptr;
size_t ret_size = 0;
MemMap::iterator i = mem_map.find(tex_name);
if(i != mem_map.end()) {
ptr = CL_MEM_PTR(i->second);
ciErr = clGetMemObjectInfo(ptr,
CL_MEM_SIZE,
sizeof(ret_size),
&ret_size,
NULL);
assert(ciErr == CL_SUCCESS);
}
return ret_size;
}
size_t get_shader_data_size(size_t max_closure)
{
/* ShaderData size with a variable-sized ShaderClosure array. */
return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
}
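/* Editorial worked example (hypothetical numbers): with MAX_CLOSURE = 64 and
* max_closure = 4, the 60 unused trailing ShaderClosure slots are trimmed:
*
* sizeof(ShaderData) - sizeof(ShaderClosure) * (64 - 4)
*
* which is why kernels built for a small max_closure need far less global
* memory per ShaderData instance.
*/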
/* Returns size of KernelGlobals structure associated with OpenCL. */
size_t get_KernelGlobals_size()
{
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
* fetch its size.
*/
typedef struct KernelGlobals {
ccl_constant KernelData *data;
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name;
#include "kernel_textures.h"
#undef KERNEL_TEX
void *sd_input;
void *isect_shadow;
} KernelGlobals;
return sizeof(KernelGlobals);
}
virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
vector<OpenCLProgram*> &programs)
{
string build_options = "-D__SPLIT_KERNEL__ ";
#ifdef __WORK_STEALING__
build_options += "-D__WORK_STEALING__ ";
#endif
build_options += requested_features.get_build_options();
/* Set compute device build option. */
cl_device_type device_type;
ciErr = clGetDeviceInfo(cdDevice,
CL_DEVICE_TYPE,
sizeof(cl_device_type),
&device_type,
NULL);
assert(ciErr == CL_SUCCESS);
if(device_type == CL_DEVICE_TYPE_GPU) {
build_options += " -D__COMPUTE_DEVICE_GPU__";
}
#define GLUE(a, b) a ## b
#define LOAD_KERNEL(name) \
do { \
GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \
GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \
programs.push_back(&GLUE(program_, name)); \
} while(false)
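/* Editorial note: LOAD_KERNEL(data_init), for example, expands roughly to
*
* program_data_init = OpenCLProgram(this, "split_data_init",
*                                   "kernel_data_init.cl", build_options);
* program_data_init.add_kernel(ustring("path_trace_data_init"));
* programs.push_back(&program_data_init);
*
* so each split kernel gets its own OpenCLProgram object and .cl file.
*/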
LOAD_KERNEL(data_init);
LOAD_KERNEL(scene_intersect);
LOAD_KERNEL(lamp_emission);
LOAD_KERNEL(queue_enqueue);
LOAD_KERNEL(background_buffer_update);
LOAD_KERNEL(shader_eval);
LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
LOAD_KERNEL(direct_lighting);
LOAD_KERNEL(shadow_blocked);
LOAD_KERNEL(next_iteration_setup);
LOAD_KERNEL(sum_all_radiance);
#undef LOAD_KERNEL
#undef GLUE
current_max_closure = requested_features.max_closure;
}
~OpenCLDeviceSplitKernel()
{
task_pool.stop();
/* Release kernels */
program_data_init.release();
program_scene_intersect.release();
program_lamp_emission.release();
program_queue_enqueue.release();
program_background_buffer_update.release();
program_shader_eval.release();
program_holdout_emission_blurring_pathtermination_ao.release();
program_direct_lighting.release();
program_shadow_blocked.release();
program_next_iteration_setup.release();
program_sum_all_radiance.release();
/* Release global memory */
release_mem_object_safe(rng_coop);
release_mem_object_safe(throughput_coop);
release_mem_object_safe(L_transparent_coop);
release_mem_object_safe(PathRadiance_coop);
release_mem_object_safe(Ray_coop);
release_mem_object_safe(PathState_coop);
release_mem_object_safe(Intersection_coop);
release_mem_object_safe(kgbuffer);
release_mem_object_safe(sd);
release_mem_object_safe(sd_DL_shadow);
release_mem_object_safe(ray_state);
release_mem_object_safe(AOAlpha_coop);
release_mem_object_safe(AOBSDF_coop);
release_mem_object_safe(AOLightRay_coop);
release_mem_object_safe(BSDFEval_coop);
release_mem_object_safe(ISLamp_coop);
release_mem_object_safe(LightRay_coop);
release_mem_object_safe(Intersection_coop_shadow);
#ifdef WITH_CYCLES_DEBUG
release_mem_object_safe(debugdata_coop);
#endif
release_mem_object_safe(use_queues_flag);
release_mem_object_safe(Queue_data);
release_mem_object_safe(Queue_index);
release_mem_object_safe(work_array);
#ifdef __WORK_STEALING__
release_mem_object_safe(work_pool_wgs);
#endif
release_mem_object_safe(per_sample_output_buffers);
if(hostRayStateArray != NULL) {
free(hostRayStateArray);
}
}
void path_trace(DeviceTask *task,
SplitRenderTile& rtile,
int2 max_render_feasible_tile_size)
{
/* cast arguments to cl types */
cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
cl_int d_x = rtile.x;
cl_int d_y = rtile.y;
cl_int d_w = rtile.w;
cl_int d_h = rtile.h;
cl_int d_offset = rtile.offset;
cl_int d_stride = rtile.stride;
/* Make sure that the render feasible tile size is a multiple of the
* local work size dimensions.
*/
assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
size_t global_size[2];
size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
SPLIT_KERNEL_LOCAL_SIZE_Y};
/* Set the range of samples to be processed for every ray in
* path-regeneration logic.
*/
cl_int start_sample = rtile.start_sample;
cl_int end_sample = rtile.start_sample + rtile.num_samples;
cl_int num_samples = rtile.num_samples;
#ifdef __WORK_STEALING__
global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
unsigned int num_parallel_samples = 1;
#else
global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
unsigned int num_threads = max_render_feasible_tile_size.x *
max_render_feasible_tile_size.y;
unsigned int num_tile_columns_possible = num_threads / global_size[1];
/* Estimate the number of samples that can be
* processed in parallel.
*/
unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
rtile.num_samples);
/* Wavefront size in AMD is 64.
* TODO(sergey): What about other platforms?
*/
if(num_parallel_samples >= 64) {
/* TODO(sergey): Could use generic round-up here. */
num_parallel_samples = (num_parallel_samples / 64) * 64;
}
assert(num_parallel_samples != 0);
global_size[0] = d_w * num_parallel_samples;
#endif /* __WORK_STEALING__ */
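/* Editorial note on the round-up used above: (((n - 1) / m) + 1) * m rounds
* n up to the nearest multiple of m. For example (hypothetical numbers), with
* d_w = 250 and local_size[0] = 64 this gives ((249 / 64) + 1) * 64 = 256,
* so the global work size always covers the tile while remaining a multiple
* of the local work size.
*/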
assert(global_size[0] * global_size[1] <=
max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
/* Allocate all required global memory once. */
if(first_tile) {
size_t num_global_elements = max_render_feasible_tile_size.x *
max_render_feasible_tile_size.y;
/* TODO(sergey): This will actually over-allocate if a
* particular kernel does not support multi-closure.
*/
size_t shaderdata_size = get_shader_data_size(current_max_closure);
#ifdef __WORK_STEALING__
/* Calculate max groups */
size_t max_global_size[2];
size_t tile_x = max_render_feasible_tile_size.x;
size_t tile_y = max_render_feasible_tile_size.y;
max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
max_work_groups = (max_global_size[0] * max_global_size[1]) /
(local_size[0] * local_size[1]);
/* Allocate work_pool_wgs memory. */
work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
#endif /* __WORK_STEALING__ */
/* Allocate queue_index memory only once. */
Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
use_queues_flag = mem_alloc(sizeof(char));
kgbuffer = mem_alloc(get_KernelGlobals_size());
/* Create global buffers for ShaderData. */
sd = mem_alloc(num_global_elements * shaderdata_size);
sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
/* Creation of global memory buffers which are shared among
* the kernels.
*/
rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
#ifdef WITH_CYCLES_DEBUG
debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
#endif
ray_state = mem_alloc(num_global_elements * sizeof(char));
hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
per_sample_output_buffers = mem_alloc(num_global_elements *
per_thread_output_buffer_size);
}
cl_int dQueue_size = global_size[0] * global_size[1];
cl_uint start_arg_index =
kernel_set_args(program_data_init(),
0,
kgbuffer,
sd_DL_shadow,
d_data,
per_sample_output_buffers,
d_rng_state,
rng_coop,
throughput_coop,
L_transparent_coop,
PathRadiance_coop,
Ray_coop,
PathState_coop,
Intersection_coop_shadow,
ray_state);
/* TODO(sergey): Avoid map lookup here. */
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX
start_arg_index +=
kernel_set_args(program_data_init(),
start_arg_index,
start_sample,
d_x,
d_y,
d_w,
d_h,
d_offset,
d_stride,
rtile.rng_state_offset_x,
rtile.rng_state_offset_y,
rtile.buffer_rng_state_stride,
Queue_data,
Queue_index,
dQueue_size,
use_queues_flag,
work_array,
#ifdef __WORK_STEALING__
work_pool_wgs,
num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
debugdata_coop,
#endif
num_parallel_samples);
kernel_set_args(program_scene_intersect(),
0,
kgbuffer,
d_data,
rng_coop,
Ray_coop,
PathState_coop,
Intersection_coop,
ray_state,
d_w,
d_h,
Queue_data,
Queue_index,
dQueue_size,
use_queues_flag,
#ifdef WITH_CYCLES_DEBUG
debugdata_coop,
#endif
num_parallel_samples);
kernel_set_args(program_lamp_emission(),
0,
kgbuffer,
d_data,
throughput_coop,
PathRadiance_coop,
Ray_coop,
PathState_coop,
Intersection_coop,
ray_state,
d_w,
d_h,
Queue_data,
Queue_index,
dQueue_size,
use_queues_flag,
num_parallel_samples);
kernel_set_args(program_queue_enqueue(),
0,
Queue_data,
Queue_index,
ray_state,
dQueue_size);
kernel_set_args(program_background_buffer_update(),
0,
kgbuffer,
d_data,
per_sample_output_buffers,
d_rng_state,
rng_coop,
throughput_coop,
PathRadiance_coop,
Ray_coop,
PathState_coop,
L_transparent_coop,
ray_state,
d_w,
d_h,
d_x,
d_y,
d_stride,
rtile.rng_state_offset_x,
rtile.rng_state_offset_y,
rtile.buffer_rng_state_stride,
work_array,
Queue_data,
Queue_index,
dQueue_size,
end_sample,
start_sample,
#ifdef __WORK_STEALING__
work_pool_wgs,
num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
debugdata_coop,
#endif
num_parallel_samples);
kernel_set_args(program_shader_eval(),
0,
kgbuffer,
d_data,
sd,
rng_coop,
Ray_coop,
PathState_coop,
Intersection_coop,
ray_state,
Queue_data,
Queue_index,
dQueue_size);
kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
0,
kgbuffer,
d_data,
sd,
per_sample_output_buffers,
rng_coop,
throughput_coop,
L_transparent_coop,
PathRadiance_coop,
PathState_coop,
Intersection_coop,
AOAlpha_coop,
AOBSDF_coop,
AOLightRay_coop,
d_w,
d_h,
d_x,
d_y,
d_stride,
ray_state,
work_array,
Queue_data,
Queue_index,
dQueue_size,
#ifdef __WORK_STEALING__
start_sample,
#endif
num_parallel_samples);
kernel_set_args(program_direct_lighting(),
0,
kgbuffer,
d_data,
sd,
rng_coop,
PathState_coop,
ISLamp_coop,
LightRay_coop,
BSDFEval_coop,
ray_state,
Queue_data,
Queue_index,
dQueue_size);
kernel_set_args(program_shadow_blocked(),
0,
kgbuffer,
d_data,
PathState_coop,
LightRay_coop,
AOLightRay_coop,
ray_state,
Queue_data,
Queue_index,
dQueue_size);
kernel_set_args(program_next_iteration_setup(),
0,
kgbuffer,
d_data,
sd,
rng_coop,
throughput_coop,
PathRadiance_coop,
Ray_coop,
PathState_coop,
LightRay_coop,
ISLamp_coop,
BSDFEval_coop,
AOLightRay_coop,
AOBSDF_coop,
AOAlpha_coop,
ray_state,
Queue_data,
Queue_index,
dQueue_size,
use_queues_flag);
kernel_set_args(program_sum_all_radiance(),
0,
d_data,
d_buffer,
per_sample_output_buffers,
num_parallel_samples,
d_w,
d_h,
d_stride,
rtile.buffer_offset_x,
rtile.buffer_offset_y,
rtile.buffer_rng_state_stride,
start_sample);
/* Macro for Enqueuing split kernels. */
#define GLUE(a, b) a ## b
#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
{ \
ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
GLUE(program_, \
kernelName)(), \
2, \
NULL, \
globalSize, \
localSize, \
0, \
NULL, \
NULL); \
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
if(ciErr != CL_SUCCESS) { \
string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
clewErrorString(ciErr)); \
opencl_error(message); \
return; \
} \
} (void) 0
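/* Editorial note: ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size,
* local_size), for example, expands to a clEnqueueNDRangeKernel() call on
* program_scene_intersect() with a 2D NDRange of global_size/local_size,
* followed by an error check that aborts path_trace() on failure.
*/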
/* Enqueue ckPathTraceKernel_data_init kernel. */
ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
bool activeRaysAvailable = true;
/* Record the number of times host intervention has been made. */
unsigned int numHostIntervention = 0;
unsigned int numNextPathIterTimes = PathIteration_times;
bool canceled = false;
while(activeRaysAvailable) {
/* Twice the global work size of other kernels for
* ckPathTraceKernel_shadow_blocked_direct_lighting. */
size_t global_size_shadow_blocked[2];
global_size_shadow_blocked[0] = global_size[0] * 2;
global_size_shadow_blocked[1] = global_size[1];
/* Do path-iteration on the host [enqueue path-iteration kernels]. */
for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
if(task->get_cancel()) {
canceled = true;
break;
}
}
/* Read ray_state into host memory to decide if we should exit
* path-iteration on the host.
*/
ciErr = clEnqueueReadBuffer(cqCommandQueue,
ray_state,
CL_TRUE,
0,
global_size[0] * global_size[1] * sizeof(char),
hostRayStateArray,
0,
NULL,
NULL);
assert(ciErr == CL_SUCCESS);
activeRaysAvailable = false;
for(int rayStateIter = 0;
rayStateIter < global_size[0] * global_size[1];
++rayStateIter)
{
if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
/* Not all rays are RAY_INACTIVE. */
activeRaysAvailable = true;
break;
}
}
if(activeRaysAvailable) {
numHostIntervention++;
PathIteration_times = PATH_ITER_INC_FACTOR;
/* Host intervention was needed before all rays became RAY_INACTIVE;
* do more initial iterations for the next tile.
*/
numNextPathIterTimes += PATH_ITER_INC_FACTOR;
}
if(task->get_cancel()) {
canceled = true;
break;
}
}
/* Execute the sum_all_radiance kernel to accumulate radiance calculated in
* per_sample_output_buffers into the RenderTile's output buffer.
*/
if(!canceled) {
size_t sum_all_radiance_local_size[2] = {16, 16};
size_t sum_all_radiance_global_size[2];
sum_all_radiance_global_size[0] =
(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
sum_all_radiance_local_size[0];
sum_all_radiance_global_size[1] =
(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
sum_all_radiance_local_size[1];
ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
sum_all_radiance_global_size,
sum_all_radiance_local_size);
}
#undef ENQUEUE_SPLIT_KERNEL
#undef GLUE
if(numHostIntervention == 0) {
/* This means that we are executing the kernels more times than required;
* this must be avoided for the next sample/tile.
*/
PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
}
else {
/* The number of path-iterations done for this tile is used as the
* initial path-iteration count for the next tile.
*/
PathIteration_times = numNextPathIterTimes;
}
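/* Editorial example (hypothetical numbers, assuming PATH_ITER_INC_FACTOR is
* 8): a first tile (PathIteration_times = 8) that needed two host
* interventions ends with numNextPathIterTimes = 8 + 2 * 8 = 24, so the next
* tile starts by enqueuing 24 path-iterations before the first ray_state
* readback. A tile that needed none backs off by one increment instead, so
* PathIteration_times converges towards the smallest count that avoids
* redundant kernel launches.
*/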
first_tile = false;
}
/* Calculates the amount of memory that always has to be
* allocated in order for the split kernel to function.
* This memory is tile/scene-property invariant (meaning,
* the value returned by this function does not depend
* on the user-set tile size or scene properties).
*/
size_t get_invariable_mem_allocated()
{
size_t total_invariable_mem_allocated = 0;
size_t KernelGlobals_size = 0;
KernelGlobals_size = get_KernelGlobals_size();
total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
return total_invariable_mem_allocated;
}
/* Calculate the tile-specific memory that has to be (or has been)
* allocated for the split kernel to function.
*/
size_t get_tile_specific_mem_allocated(const int2 tile_size)
{
size_t tile_specific_mem_allocated = 0;
/* Get required tile info */
unsigned int user_set_tile_w = tile_size.x;
unsigned int user_set_tile_h = tile_size.y;
#ifdef __WORK_STEALING__
/* Calculate memory to be allocated for work_pools in
* case of work_stealing.
*/
size_t max_global_size[2];
size_t max_num_work_pools = 0;
max_global_size[0] =
(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_X;
max_global_size[1] =
(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
max_num_work_pools =
(max_global_size[0] * max_global_size[1]) /
(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
#endif
tile_specific_mem_allocated +=
user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
tile_specific_mem_allocated +=
user_set_tile_w * user_set_tile_h * sizeof(RNG);
return tile_specific_mem_allocated;
}
/* Calculates the texture memories and KernelData (d_data) memory
* that has been allocated.
*/
size_t get_scene_specific_mem_allocated(cl_mem d_data)
{
size_t scene_specific_mem_allocated = 0;
/* Calculate texture memories. */
#define KERNEL_TEX(type, ttype, name) \
scene_specific_mem_allocated += get_tex_size(#name);
#include "kernel_textures.h"
#undef KERNEL_TEX
size_t d_data_size;
ciErr = clGetMemObjectInfo(d_data,
CL_MEM_SIZE,
sizeof(d_data_size),
&d_data_size,
NULL);
assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
scene_specific_mem_allocated += d_data_size;
return scene_specific_mem_allocated;
}
/* Calculate the memory required for one thread in split kernel. */
size_t get_per_thread_memory()
{
size_t shaderdata_size = 0;
/* TODO(sergey): This will actually over-allocate if a
* particular kernel does not support multi-closure.
*/
shaderdata_size = get_shader_data_size(current_max_closure);
size_t retval = sizeof(RNG)
+ sizeof(float3) /* Throughput size */
+ sizeof(float) /* L transparent size */
+ sizeof(char) /* Ray state size */
+ sizeof(unsigned int) /* Work element size */
+ sizeof(int) /* ISLamp_size */
+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
+ sizeof(Intersection) /* Overall isect */
+ sizeof(Intersection) /* Intersection_coop_AO */
+ sizeof(Intersection) /* Intersection coop DL */
+ shaderdata_size /* Overall ShaderData */
+ (shaderdata_size * 2) /* ShaderData : DL and shadow */
+ sizeof(Ray) + sizeof(BsdfEval)
+ sizeof(float3) /* AOAlpha size */
+ sizeof(float3) /* AOBSDF size */
+ sizeof(Ray)
+ (sizeof(int) * NUM_QUEUES)
+ per_thread_output_buffer_size;
return retval;
}
/* Considers the total memory available in the device
* and returns the maximum global work size possible.
*/
size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
{
/* Calculate invariably allocated memory. */
size_t invariable_mem_allocated = get_invariable_mem_allocated();
/* Calculate tile specific allocated memory. */
size_t tile_specific_mem_allocated =
get_tile_specific_mem_allocated(tile_size);
/* Calculate scene specific allocated memory. */
size_t scene_specific_mem_allocated =
get_scene_specific_mem_allocated(d_data);
/* Calculate total memory available for the threads in global work size. */
size_t available_memory = total_allocatable_memory
- invariable_mem_allocated
- tile_specific_mem_allocated
- scene_specific_mem_allocated
- DATA_ALLOCATION_MEM_FACTOR;
size_t per_thread_memory_required = get_per_thread_memory();
return (available_memory / per_thread_memory_required);
}
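/* Editorial worked example (hypothetical numbers): with 1 GiB of allocatable
* device memory, roughly 100 MiB consumed by the invariable, tile-specific
* and scene-specific allocations plus DATA_ALLOCATION_MEM_FACTOR, and about
* 1 KiB required per thread, the feasible global work size is around
* 924 MiB / 1 KiB ~= 946000 threads, i.e. roughly a 970x970 tile before
* rounding to the local work size.
*/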
/* Checks if the device has enough memory to render the whole tile;
* if not, we should split the single tile into multiple smaller tiles
* and process them all.
*/
bool need_to_split_tile(unsigned int d_w,
unsigned int d_h,
int2 max_render_feasible_tile_size)
{
size_t global_size_estimate[2];
/* TODO(sergey): Such round-ups are in quite a few places; need to replace
* them with a utility macro.
*/
global_size_estimate[0] =
(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_X;
global_size_estimate[1] =
(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
if((global_size_estimate[0] * global_size_estimate[1]) >
(max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
{
return true;
}
else {
return false;
}
}
/* Considers the scene properties and the global memory available in the
* device, and returns a rectangular tile dimension (approximately the
* maximum) that the split kernel should render.
*/
int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
{
int2 max_render_feasible_tile_size;
int square_root_val = (int)sqrt(feasible_global_work_size);
max_render_feasible_tile_size.x = square_root_val;
max_render_feasible_tile_size.y = square_root_val;
/* Ceil round-off max_render_feasible_tile_size. */
int2 ceil_render_feasible_tile_size;
ceil_render_feasible_tile_size.x =
(((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_X;
ceil_render_feasible_tile_size.y =
(((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
feasible_global_work_size)
{
return ceil_render_feasible_tile_size;
}
/* Floor round-off max_render_feasible_tile_size. */
int2 floor_render_feasible_tile_size;
floor_render_feasible_tile_size.x =
(max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
SPLIT_KERNEL_LOCAL_SIZE_X;
floor_render_feasible_tile_size.y =
(max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
return floor_render_feasible_tile_size;
}
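/* Editorial worked example (hypothetical numbers, assuming a 16x16 local
* work size): with feasible_global_work_size = 250000 the square root is 500.
* Rounding up gives 512x512 = 262144 threads, which exceeds the feasible
* work size, so the floor round-off is used and a 496x496 tile
* (246016 threads) is returned.
*/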
/* Try splitting the current tile into multiple smaller
* almost-square-tiles.
*/
int2 get_split_tile_size(RenderTile rtile,
int2 max_render_feasible_tile_size)
{
int2 split_tile_size;
int num_global_threads = max_render_feasible_tile_size.x *
max_render_feasible_tile_size.y;
int d_w = rtile.w;
int d_h = rtile.h;
/* Ceil round off d_w and d_h */
d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_X;
d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
while(d_w * d_h > num_global_threads) {
/* Halve the longer dimension. */
if(d_w >= d_h) {
d_w = d_w / 2;
d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_X;
}
else {
d_h = d_h / 2;
d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
}
}
split_tile_size.x = d_w;
split_tile_size.y = d_h;
return split_tile_size;
}
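/* Editorial worked example (hypothetical numbers, assuming a 16x16 local
* work size): a 1024x1024 tile with num_global_threads = 250000 is halved
* along its longer dimension repeatedly:
* 1024x1024 -> 512x1024 -> 512x512 -> 256x512,
* at which point 256 * 512 = 131072 <= 250000, so the split tile size is
* 256x512.
*/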
/* Splits existing tile into multiple tiles of tile size split_tile_size. */
vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
{
vector<SplitRenderTile> to_path_trace_rtile;
int d_w = rtile.w;
int d_h = rtile.h;
int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
/* Buffer and rng_state offset calc. */
size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
size_t offset_x = offset_index % rtile.stride;
size_t offset_y = offset_index / rtile.stride;
/* Resize to_path_trace_rtile. */
to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
to_path_trace_rtile[rtile_index].sample = rtile.sample;
to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
to_path_trace_rtile[rtile_index].offset = rtile.offset;
to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
/* Fill width and height of the new render tile. */
to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
: split_tile_size.x;
to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
: split_tile_size.y;
to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
}
}
return to_path_trace_rtile;
}
void thread_run(DeviceTask *task)
{
if(task->type == DeviceTask::FILM_CONVERT) {
film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
}
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
else if(task->type == DeviceTask::PATH_TRACE) {
RenderTile tile;
bool initialize_data_and_check_render_feasibility = false;
bool need_to_split_tiles_further = false;
int2 max_render_feasible_tile_size;
size_t feasible_global_work_size;
const int2 tile_size = task->requested_tile_size;
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
if(!initialize_data_and_check_render_feasibility) {
/* Initialize data. */
/* Calculate per_thread_output_buffer_size. */
size_t output_buffer_size = 0;
ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
CL_MEM_SIZE,
sizeof(output_buffer_size),
&output_buffer_size,
NULL);
assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
/* This value is different when running on AMD and NV. */
if(background) {
/* In offline render the number of buffer elements
* associated with tile.buffer is the current tile size.
*/
per_thread_output_buffer_size =
output_buffer_size / (tile.w * tile.h);
}
else {
/* In interactive rendering, unlike offline render, the number of buffer
* elements associated with tile.buffer is the entire viewport size.
*/
per_thread_output_buffer_size =
output_buffer_size / (tile.buffers->params.width *
tile.buffers->params.height);
}
/* Check render feasibility. */
feasible_global_work_size = get_feasible_global_work_size(
tile_size,
CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
max_render_feasible_tile_size =
get_max_render_feasible_tile_size(
feasible_global_work_size);
need_to_split_tiles_further =
need_to_split_tile(tile_size.x,
tile_size.y,
max_render_feasible_tile_size);
initialize_data_and_check_render_feasibility = true;
}
if(need_to_split_tiles_further) {
int2 split_tile_size =
get_split_tile_size(tile,
max_render_feasible_tile_size);
vector<SplitRenderTile> to_path_trace_render_tiles =
split_tiles(tile, split_tile_size);
/* Print message to console */
if(background && (to_path_trace_render_tiles.size() > 1)) {
fprintf(stderr, "Message : Tiles need to be split "
"further inside path trace (due to insufficient "
"device-global-memory for split kernel to "
"function) \n"
"The current tile of dimensions %dx%d is split "
"into tiles of dimension %dx%d for render \n",
tile.w, tile.h,
split_tile_size.x,
split_tile_size.y);
}
/* Process all split tiles. */
for(int tile_iter = 0;
tile_iter < to_path_trace_render_tiles.size();
++tile_iter)
{
path_trace(task,
to_path_trace_render_tiles[tile_iter],
max_render_feasible_tile_size);
}
}
else {
/* No splitting required; process the entire tile at once. */
/* The render feasible tile size is the user-set tile size itself. */
max_render_feasible_tile_size.x =
(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_X;
max_render_feasible_tile_size.y =
(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
SPLIT_KERNEL_LOCAL_SIZE_Y;
/* buffer_rng_state_stride is stride itself. */
SplitRenderTile split_tile(tile);
split_tile.buffer_rng_state_stride = tile.stride;
path_trace(task, split_tile, max_render_feasible_tile_size);
}
tile.sample = tile.start_sample + tile.num_samples;
/* Complete kernel execution before releasing the tile. */
/* This helps in multi-device render; the device that reaches the
* critical-section function release_tile waits (stalling other devices
* from entering release_tile) for all kernels to complete. If device1
* (a slow-render device) reached release_tile first, it would
* stall device2 (a fast-render device) from proceeding to render the
* next tile.
*/
clFinish(cqCommandQueue);
task->release_tile(tile);
}
}
}
protected:
cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
{
cl_mem ptr;
assert(bufsize != 0);
ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
opencl_assert_err(ciErr, "clCreateBuffer");
return ptr;
}
/* ** These are for working around some compiler-specific bugs. ** */
string build_options_for_base_program(
const DeviceRequestedFeatures& requested_features)
{
return requested_features.get_build_options();
}
};
Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
{
return new OpenCLDeviceSplitKernel(info, stats, background);
}
CCL_NAMESPACE_END
#endif /* WITH_OPENCL */