Merge branch 'master' into 28
This commit is contained in:
@@ -1613,10 +1613,23 @@ int2 CUDASplitKernel::split_kernel_local_size()
|
||||
return make_int2(32, 1);
|
||||
}
|
||||
|
||||
int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/)
|
||||
int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
|
||||
{
|
||||
/* TODO(mai): implement something here to detect ideal work size */
|
||||
return make_int2(256, 256);
|
||||
size_t free;
|
||||
size_t total;
|
||||
|
||||
device->cuda_push_context();
|
||||
cuda_assert(cuMemGetInfo(&free, &total));
|
||||
device->cuda_pop_context();
|
||||
|
||||
VLOG(1) << "Maximum device allocation size: "
|
||||
<< string_human_readable_number(free) << " bytes. ("
|
||||
<< string_human_readable_size(free) << ").";
|
||||
|
||||
size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
|
||||
int2 global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements));
|
||||
VLOG(1) << "Global size: " << global_size << ".";
|
||||
return global_size;
|
||||
}
|
||||
|
||||
bool device_cuda_init(void)
|
||||
|
@@ -128,26 +128,27 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
|
||||
local_size[1] = lsize[1];
|
||||
}
|
||||
|
||||
/* Set global size */
|
||||
size_t global_size[2];
|
||||
{
|
||||
int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
|
||||
|
||||
/* Make sure that set work size is a multiple of local
|
||||
* work size dimensions.
|
||||
*/
|
||||
global_size[0] = round_up(gsize[0], local_size[0]);
|
||||
global_size[1] = round_up(gsize[1], local_size[1]);
|
||||
}
|
||||
|
||||
/* Number of elements in the global state buffer */
|
||||
int num_global_elements = global_size[0] * global_size[1];
|
||||
assert(num_global_elements % WORK_POOL_SIZE == 0);
|
||||
|
||||
/* Allocate all required global memory once. */
|
||||
if(first_tile) {
|
||||
first_tile = false;
|
||||
|
||||
/* Set global size */
|
||||
{
|
||||
int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
|
||||
|
||||
/* Make sure that set work size is a multiple of local
|
||||
* work size dimensions.
|
||||
*/
|
||||
global_size[0] = round_up(gsize[0], local_size[0]);
|
||||
global_size[1] = round_up(gsize[1], local_size[1]);
|
||||
}
|
||||
|
||||
num_global_elements = global_size[0] * global_size[1];
|
||||
assert(num_global_elements % WORK_POOL_SIZE == 0);
|
||||
|
||||
/* Calculate max groups */
|
||||
|
||||
/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
|
||||
|
@@ -95,6 +95,9 @@ private:
|
||||
/* Marked True in constructor and marked false at the end of path_trace(). */
|
||||
bool first_tile;
|
||||
|
||||
/* Cached global size */
|
||||
size_t global_size[2];
|
||||
|
||||
public:
|
||||
explicit DeviceSplitKernel(Device* device);
|
||||
virtual ~DeviceSplitKernel();
|
||||
|
Reference in New Issue
Block a user