Cycles: prepare to make CUDA 5.0 the official version we use

* Add CUDA compiler version detection to cmake/scons/runtime * Remove noinline in kernel_shader.h and reenable --use_fast_math if CUDA 5.x is used, these were workarounds for CUDA 4.2 bugs * Change max number of registers to 32 for sm 2.x (based on performance tests from Martijn Berger and confirmed here), and also for NVidia OpenCL. Overall it seems that with these changes and the latest CUDA 5.0 download, that performance is as good as or better than the 2.67b release with the scenes and graphics cards I tested.
2013-06-19 17:54:23 +00:00
parent a7416641e6
commit 16204bd647
8 changed files with 209 additions and 22 deletions
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -271,11 +271,53 @@ public:
 			return "";
 		}

+		int cuda_version = cuCompilerVersion();
+
+		if(cuda_version == 0) {
+			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+			return "";
+		}
+
+		if(cuda_version != 50)
+			printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+
 		/* compile */
 		string kernel = path_join(kernel_path, "kernel.cu");
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
-		const int maxreg = 24;
+		string arch_flags;
+
+		/* build flags depending on CUDA version and arch */
+		if(cuda_version < 50) {
+			/* CUDA 4.x */
+			if(major == 1) {
+				/* sm_1x */
+				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
+			}
+			else if(major == 2) {
+				/* sm_2x */
+				arch_flags = "--maxrregcount=24";
+			}
+			else {
+				/* sm_3x */
+				arch_flags = "--maxrregcount=32";
+			}
+		}
+		else {
+			/* CUDA 4.x */
+			if(major == 1) {
+				/* sm_1x */
+				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
+			}
+			else if(major == 2) {
+				/* sm_2x */
+				arch_flags = "--maxrregcount=32 --use_fast_math";
+			}
+			else {
+				/* sm_3x */
+				arch_flags = "--maxrregcount=32 --use_fast_math";
+			}
+		}

 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
@@ -283,8 +325,10 @@ public:
 		path_create_directories(cubin);

 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
-			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
+			"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
+
+		printf("%s\n", command.c_str());

 		if(system(command.c_str()) == -1) {
 			cuda_error_message("Failed to execute compilation command, see console for details.");