Cycles: Add an AVX kernel for CPU rendering.

* AVX is available on Intel Sandy Bridge and newer and AMD Bulldozer and newer.
* We don't use dedicated AVX intrinsics yet, but gcc auto vectorization gives a 3% performance improvement for Caminandes. Tested on an i5-3570, Linux x64.
* No change for Windows yet, MSVC 2008 does not support AVX.

Reviewed by: brecht

Differential Revision: https://developer.blender.org/D216
2014-01-16 17:04:11 +01:00
parent 7c6d52eb07
commit de28a4d4b2
9 changed files with 180 additions and 1 deletions
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -61,6 +61,7 @@ public:
 		system_cpu_support_sse2();
 		system_cpu_support_sse3();
 		system_cpu_support_sse41();
+		system_cpu_support_avx();
 	}

 	~CPUDevice()
@@ -166,6 +167,28 @@ public:
 			int start_sample = tile.start_sample;
 			int end_sample = tile.start_sample + tile.num_samples;

+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+			if(system_cpu_support_avx()) {
+				for(int sample = start_sample; sample < end_sample; sample++) {
+					if (task.get_cancel() || task_pool.canceled()) {
+						if(task.need_finish_queue == false)
+							break;
+					}
+
+					for(int y = tile.y; y < tile.y + tile.h; y++) {
+						for(int x = tile.x; x < tile.x + tile.w; x++) {
+							kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
+								sample, x, y, tile.offset, tile.stride);
+						}
+					}
+
+					tile.sample = sample + 1;
+
+					task.update_progress(tile);
+				}
+			}
+			else
+#endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
@@ -270,6 +293,15 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);

 		if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+			if(system_cpu_support_avx()) {
+				for(int y = task.y; y < task.y + task.h; y++)
+					for(int x = task.x; x < task.x + task.w; x++)
+						kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+							sample_scale, x, y, task.offset, task.stride);
+			}
+			else
+#endif	
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int y = task.y; y < task.y + task.h; y++)
@@ -305,6 +337,15 @@ public:
 			}
 		}
 		else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+			if(system_cpu_support_avx()) {
+				for(int y = task.y; y < task.y + task.h; y++)
+					for(int x = task.x; x < task.x + task.w; x++)
+						kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+							sample_scale, x, y, task.offset, task.stride);
+			}
+			else
+#endif		
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int y = task.y; y < task.y + task.h; y++)
@@ -349,6 +390,17 @@ public:
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif

+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		if(system_cpu_support_avx()) {
+			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+				kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+				if(task_pool.canceled())
+					break;
+			}
+		}
+		else
+#endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 		if(system_cpu_support_sse41()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {