diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index edee32e6506..9200473825d 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -253,7 +253,6 @@ public: return false; } - return true; } @@ -315,17 +314,6 @@ public: string kernel = path_join(kernel_path, "kernel.cu"); string include = kernel_path; const int machine = system_cpu_bits(); - string arch_flags; - - /* CUDA 5.x build flags for different archs */ - if(major == 2) { - /* sm_2x */ - arch_flags = "--maxrregcount=40 --use_fast_math"; - } - else if(major == 3) { - /* sm_3x */ - arch_flags = "--maxrregcount=32 --use_fast_math"; - } double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -333,8 +321,8 @@ public: path_create_directories(cubin); string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" " - "-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d", - nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version); + "-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d", + nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version); printf("%s\n", command.c_str()); @@ -665,9 +653,18 @@ public: cuda_assert(cuParamSetSize(cuPathTrace, offset)) - /* launch kernel: todo find optimal size, cache config for fermi */ - int xthreads = 16; - int ythreads = 16; + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace)) + + /*int num_registers; + cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)) + + printf("threads_per_block %d\n", threads_per_block); + printf("num_registers %d\n", num_registers);*/ + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); int xblocks = (rtile.w + xthreads - 1)/xthreads; int yblocks = (rtile.h + ythreads - 1)/ythreads; @@ -730,9 +727,12 @@ public: cuda_assert(cuParamSetSize(cuFilmConvert, offset)) - /* launch kernel: todo find optimal size, cache config for fermi */ - int xthreads = 16; - int ythreads = 16; + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)) + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); int xblocks = (task.w + xthreads - 1)/xthreads; int yblocks = (task.h + ythreads - 1)/ythreads; @@ -752,40 +752,42 @@ public: cuda_push_context(); - CUfunction cuDisplace; + CUfunction cuShader; CUdeviceptr d_input = cuda_device_ptr(task.shader_input); CUdeviceptr d_output = cuda_device_ptr(task.shader_output); /* get kernel function */ - cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader")) + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")) /* pass in parameters */ int offset = 0; - cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input))) + cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input))) offset += sizeof(d_input); - cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output))) + cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output))) offset += sizeof(d_output); int shader_eval_type = task.shader_eval_type; offset = align_up(offset, __alignof(shader_eval_type)); - cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type)) + cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type)) offset += sizeof(task.shader_eval_type); - cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x)) + cuda_assert(cuParamSeti(cuShader, offset, task.shader_x)) offset += sizeof(task.shader_x); - cuda_assert(cuParamSetSize(cuDisplace, offset)) + cuda_assert(cuParamSetSize(cuShader, offset)) - /* launch kernel: todo find optimal size, cache config for fermi */ - int xthreads = 16; - int xblocks = (task.shader_w + xthreads - 1)/xthreads; + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)) - cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1)) - cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1)) - cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1)) + int xblocks = (task.shader_w + threads_per_block - 1)/threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)) + cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1)) + cuda_assert(cuLaunchGrid(cuShader, xblocks, 1)) cuda_pop_context(); } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 7dab65f4dd2..1527d154c86 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -161,16 +161,6 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_cubin kernel_${arch}.cubin) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}") - - # CUDA 5.x build flags for different archs - if(${arch} MATCHES "sm_2[0-9]") - # sm_2x - set(cuda_arch_flags "--maxrregcount=40") - elseif(${arch} MATCHES "sm_3[0-9]") - # sm_3x - set(cuda_arch_flags "--maxrregcount=32") - endif() - set(cuda_math_flags "--use_fast_math") if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35") diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript index 61ddaff59a5..347835e6ef9 100644 --- a/intern/cycles/kernel/SConscript +++ b/intern/cycles/kernel/SConscript @@ -87,14 +87,6 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: for arch in cuda_archs: cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch) - # CUDA 5.x build flags for different archs - if arch.startswith("sm_2"): - # sm_2x - cuda_arch_flags = "--maxrregcount=40 --use_fast_math" - elif arch.startswith("sm_3"): - # sm_3x - cuda_arch_flags = "--maxrregcount=32 --use_fast_math" - if env['BF_CYCLES_CUDA_ENV']: MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" command = "\"%s\" & \"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file) diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu index 5e6748c66fc..ade72715f61 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernel.cu @@ -24,7 +24,71 @@ #include "kernel_path.h" #include "kernel_displace.h" -extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +/* device data taken from CUDA occupancy calculator */ + +#ifdef __CUDA_ARCH__ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +#define CUDA_BLOCK_MAX_THREADS 1024 +#define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +#define CUDA_THREADS_BLOCK_WIDTH 16 +#define CUDA_KERNEL_MAX_REGISTERS 32 +#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +#define CUDA_BLOCK_MAX_THREADS 1024 +#define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +#define CUDA_THREADS_BLOCK_WIDTH 16 +#define CUDA_KERNEL_MAX_REGISTERS 32 +#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* unknown architecture */ +#else +#error "Unknown or unuspported CUDA architecture, can't determine launch bounds" +#endif + +/* compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +#error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +#error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +#error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +#error "Maximum number of registers per thread exceeded" +#endif + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; @@ -34,7 +98,9 @@ extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state } #ifdef __BRANCHED_PATH__ -extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) +kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; @@ -44,7 +110,9 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint * } #endif -extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; @@ -53,7 +121,9 @@ extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buff kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); } -extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; @@ -62,10 +132,14 @@ extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); } -extern "C" __global__ void kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx) +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x); } +#endif +