Cycles: Add an AVX kernel for CPU rendering.

* AVX is available on Intel Sandy Bridge and newer and AMD Bulldozer and newer.
* We don't use dedicated AVX intrinsics yet, but gcc auto vectorization gives a 3% performance improvement for Caminandes. Tested on an i5-3570, Linux x64.
* No change for Windows yet, MSVC 2008 does not support AVX.

Reviewed by: brecht
Differential Revision: https://developer.blender.org/D216
This commit is contained in:
Thomas Dinges
2014-01-16 17:04:11 +01:00
parent 7c6d52eb07
commit de28a4d4b2
9 changed files with 180 additions and 1 deletions

View File

@@ -61,6 +61,7 @@ public:
system_cpu_support_sse2();
system_cpu_support_sse3();
system_cpu_support_sse41();
system_cpu_support_avx();
}
~CPUDevice()
@@ -166,6 +167,28 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
for(int y = tile.y; y < tile.y + tile.h; y++) {
for(int x = tile.x; x < tile.x + tile.w; x++) {
kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
sample, x, y, tile.offset, tile.stride);
}
}
tile.sample = sample + 1;
task.update_progress(tile);
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@@ -270,6 +293,15 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -305,6 +337,15 @@ public:
}
}
else {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -349,6 +390,17 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
if(task_pool.canceled())
break;
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {