Cycles: Add an AVX kernel for CPU rendering.
* AVX is available on Intel Sandy Bridge and newer, and on AMD Bulldozer and newer.
* We don't use dedicated AVX intrinsics yet, but gcc auto-vectorization gives a 3% performance improvement for Caminandes. Tested on an i5-3570, Linux x64.
* No change for Windows yet, since MSVC 2008 does not support AVX.

Reviewed by: brecht
Differential Revision: https://developer.blender.org/D216
This commit is contained in:
@@ -61,6 +61,7 @@ public:
|
||||
system_cpu_support_sse2();
|
||||
system_cpu_support_sse3();
|
||||
system_cpu_support_sse41();
|
||||
system_cpu_support_avx();
|
||||
}
|
||||
|
||||
~CPUDevice()
|
||||
@@ -166,6 +167,28 @@ public:
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
if(system_cpu_support_avx()) {
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if (task.get_cancel() || task_pool.canceled()) {
|
||||
if(task.need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
|
||||
for(int y = tile.y; y < tile.y + tile.h; y++) {
|
||||
for(int x = tile.x; x < tile.x + tile.w; x++) {
|
||||
kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
|
||||
sample, x, y, tile.offset, tile.stride);
|
||||
}
|
||||
}
|
||||
|
||||
tile.sample = sample + 1;
|
||||
|
||||
task.update_progress(tile);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
@@ -270,6 +293,15 @@ public:
|
||||
float sample_scale = 1.0f/(task.sample + 1);
|
||||
|
||||
if(task.rgba_half) {
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
if(system_cpu_support_avx()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
|
||||
sample_scale, x, y, task.offset, task.stride);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
@@ -305,6 +337,15 @@ public:
|
||||
}
|
||||
}
|
||||
else {
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
if(system_cpu_support_avx()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
|
||||
sample_scale, x, y, task.offset, task.stride);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
@@ -349,6 +390,17 @@ public:
|
||||
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
|
||||
#endif
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
if(system_cpu_support_avx()) {
|
||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
|
||||
kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
|
||||
|
||||
if(task_pool.canceled())
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
|
||||
|
Reference in New Issue
Block a user