Cycles: CUDA split performance tweaks, still far from megakernel.
On Pabellon, 25.8s mega, 35.4s split before, 32.7s split after.
This commit is contained in:
@@ -1898,17 +1898,13 @@ public:
|
|||||||
int threads_per_block;
|
int threads_per_block;
|
||||||
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
|
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
|
||||||
|
|
||||||
int xthreads = (int)sqrt(threads_per_block);
|
int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
|
||||||
int ythreads = (int)sqrt(threads_per_block);
|
|
||||||
|
|
||||||
int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
|
|
||||||
int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
|
|
||||||
|
|
||||||
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
|
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
|
||||||
|
|
||||||
cuda_assert(cuLaunchKernel(func,
|
cuda_assert(cuLaunchKernel(func,
|
||||||
xblocks , yblocks, 1, /* blocks */
|
xblocks, 1, 1, /* blocks */
|
||||||
xthreads, ythreads, 1, /* threads */
|
threads_per_block, 1, 1, /* threads */
|
||||||
0, 0, args, 0));
|
0, 0, args, 0));
|
||||||
|
|
||||||
device->cuda_pop_context();
|
device->cuda_pop_context();
|
||||||
|
@@ -81,8 +81,13 @@
|
|||||||
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
|
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* compute number of threads per block and minimum blocks per multiprocessor
|
/* For split kernel using all registers seems fastest for now, but this
|
||||||
* given the maximum number of registers per thread */
|
* is unlikely to be optimal once we resolve other bottlenecks. */
|
||||||
|
|
||||||
|
#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
|
||||||
|
|
||||||
|
/* Compute number of threads per block and minimum blocks per multiprocessor
|
||||||
|
* given the maximum number of registers per thread. */
|
||||||
|
|
||||||
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
|
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
|
||||||
__launch_bounds__( \
|
__launch_bounds__( \
|
||||||
|
@@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(
|
|||||||
|
|
||||||
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||||
extern "C" __global__ void \
|
extern "C" __global__ void \
|
||||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
|
||||||
kernel_cuda_##name() \
|
kernel_cuda_##name() \
|
||||||
{ \
|
{ \
|
||||||
kernel_##name(NULL); \
|
kernel_##name(NULL); \
|
||||||
@@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(
|
|||||||
|
|
||||||
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
|
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
|
||||||
extern "C" __global__ void \
|
extern "C" __global__ void \
|
||||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
|
||||||
kernel_cuda_##name() \
|
kernel_cuda_##name() \
|
||||||
{ \
|
{ \
|
||||||
ccl_local type locals; \
|
ccl_local type locals; \
|
||||||
|
Reference in New Issue
Block a user