diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index edee32e6506..9200473825d 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -253,7 +253,6 @@ public:
 			return false;
 		}
 		
-
 		return true;
 	}
 
@@ -315,17 +314,6 @@ public:
 		string kernel = path_join(kernel_path, "kernel.cu");
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
-		string arch_flags;
-
-		/* CUDA 5.x build flags for different archs */
-		if(major == 2) {
-			/* sm_2x */
-			arch_flags = "--maxrregcount=40 --use_fast_math";
-		}
-		else if(major == 3) {
-			/* sm_3x */
-			arch_flags = "--maxrregcount=32 --use_fast_math";
-		}
 
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
@@ -333,8 +321,8 @@ public:
 		path_create_directories(cubin);
 
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
-			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
+			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 
 		printf("%s\n", command.c_str());
 
@@ -665,9 +653,18 @@ public:
 
 		cuda_assert(cuParamSetSize(cuPathTrace, offset))
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int ythreads = 16;
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace))
+
+		/*int num_registers;
+		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace))
+
+		printf("threads_per_block %d\n", threads_per_block);
+		printf("num_registers %d\n", num_registers);*/
+
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
 		int xblocks = (rtile.w + xthreads - 1)/xthreads;
 		int yblocks = (rtile.h + ythreads - 1)/ythreads;
 
@@ -730,9 +727,12 @@ public:
 
 		cuda_assert(cuParamSetSize(cuFilmConvert, offset))
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int ythreads = 16;
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert))
+
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
 		int xblocks = (task.w + xthreads - 1)/xthreads;
 		int yblocks = (task.h + ythreads - 1)/ythreads;
 
@@ -752,40 +752,42 @@ public:
 
 		cuda_push_context();
 
-		CUfunction cuDisplace;
+		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
 
 		/* get kernel function */
-		cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))
+		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"))
 		
 		/* pass in parameters */
 		int offset = 0;
 		
-		cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
+		cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)))
 		offset += sizeof(d_input);
 
-		cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
+		cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)))
 		offset += sizeof(d_output);
 
 		int shader_eval_type = task.shader_eval_type;
 		offset = align_up(offset, __alignof(shader_eval_type));
 
-		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
+		cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type))
 		offset += sizeof(task.shader_eval_type);
 
-		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
+		cuda_assert(cuParamSeti(cuShader, offset, task.shader_x))
 		offset += sizeof(task.shader_x);
 
-		cuda_assert(cuParamSetSize(cuDisplace, offset))
+		cuda_assert(cuParamSetSize(cuShader, offset))
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int xblocks = (task.shader_w + xthreads - 1)/xthreads;
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader))
 
-		cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
-		cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
-		cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
+		int xblocks = (task.shader_w + threads_per_block - 1)/threads_per_block;
+
+		cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1))
+		cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1))
+		cuda_assert(cuLaunchGrid(cuShader, xblocks, 1))
 
 		cuda_pop_context();
 	}
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 7dab65f4dd2..1527d154c86 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -161,16 +161,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_cubin kernel_${arch}.cubin)
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
-
-		# CUDA 5.x build flags for different archs
-		if(${arch} MATCHES "sm_2[0-9]")
-			# sm_2x
-			set(cuda_arch_flags "--maxrregcount=40")
-		elseif(${arch} MATCHES "sm_3[0-9]")
-			# sm_3x
-			set(cuda_arch_flags "--maxrregcount=32")
-		endif()
-
 		set(cuda_math_flags "--use_fast_math")
 		
 		if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 61ddaff59a5..347835e6ef9 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -87,14 +87,6 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     for arch in cuda_archs:
         cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
-        # CUDA 5.x build flags for different archs
-        if arch.startswith("sm_2"):
-            # sm_2x
-            cuda_arch_flags = "--maxrregcount=40 --use_fast_math"
-        elif arch.startswith("sm_3"):
-            # sm_3x
-            cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
-
         if env['BF_CYCLES_CUDA_ENV']:
             MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
             command = "\"%s\" & \"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 5e6748c66fc..ade72715f61 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -24,7 +24,71 @@
 #include "kernel_path.h"
 #include "kernel_displace.h"
 
-extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+/* device data taken from CUDA occupancy calculator */
+
+#ifdef __CUDA_ARCH__
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* unknown architecture */
+#else
+#error "Unknown or unuspported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		threads_block_width*threads_block_width, \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -34,7 +98,9 @@ extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state
 }
 
 #ifdef __BRANCHED_PATH__
-extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
+kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -44,7 +110,9 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *
 }
 #endif
 
-extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -53,7 +121,9 @@ extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buff
 		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -62,10 +132,14 @@ extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float
 		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-extern "C" __global__ void kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
 }
 
+#endif
+