Fix #29259: cycles issues on certain processors. Now two versions of the kernel

are compiled, one SSE optimized and the other not, and it will choose between
them at runtime.
This commit is contained in:
Brecht Van Lommel
2011-11-15 15:13:38 +00:00
parent 2bc7821913
commit db8024f4b5
8 changed files with 226 additions and 45 deletions

View File

@@ -9,31 +9,18 @@ include(cmake/external_libs.cmake)
# Build Flags
if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
set(GCC_OPTIM_FLAGS "-ffast-math -msse -msse2 -msse3")
endif()
set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
endif()
if(WIN32)
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
if(WIN32 AND MSVC)
set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
elseif(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -DGOGOGO")
endif()
endif()
if(UNIX AND NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
endif()
# not needed yet, is for open shading language
set(RTTI_DISABLE_FLAGS "")
# for OSL, not needed yet
# set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
# set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
# Definitions and Includes
@@ -42,6 +29,10 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
add_definitions(-DCCL_NAMESPACE_END=})
if(WITH_CYCLES_OPTIMIZED_KERNEL)
add_definitions(-DWITH_OPTIMIZED_KERNEL)
endif()
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()

View File

@@ -10,11 +10,10 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
sources.remove(path.join('util', 'util_view.cpp'))
sources.remove(path.join('render', 'film_response.cpp'))
sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
incs = []
defs = []
ccflags = []
cxxflags = []
defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
defs.append('CCL_NAMESPACE_END=}')
@@ -23,14 +22,6 @@ defs.append('WITH_OPENCL')
defs.append('WITH_MULTI')
defs.append('WITH_CUDA')
if env['OURPLATFORM'] in ('win32-mingw'):
if env['WITH_BF_RAYOPTIMIZATION']:
cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
ccflags.append('-ffast-math -msse -msse2 -msse3'.split())
# not needed yet, is for open shading language
# cxxflags.append('-fno-rtti'.split())
# defs.append('BOOST_NO_RTTI BOOST_NO_TYPEID'.split())
incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna'.split())
incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
@@ -39,5 +30,20 @@ incs.append(cycles['BF_OIIO_INC'])
incs.append(cycles['BF_BOOST_INC'])
incs.append(cycles['BF_PYTHON_INC'])
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None], cc_compileflags=ccflags, cxx_compileflags=cxxflags)
# optimized kernel
if env['WITH_BF_RAYOPTIMIZATION']:
optim_cxxflags = []
if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
optim_cxxflags.append('/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast'.split())
else:
optim_cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
optim_defs = defs + ['WITH_OPTIMIZED_KERNEL']
optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
cycles_optim = cycles.Clone()
cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[0], compileflags=[None], cxx_compileflags=optim_cxxflags)
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None])

View File

@@ -48,6 +48,9 @@ public:
{
kg = kernel_globals_create();
/* do now to avoid thread issues */
system_cpu_support_optimized();
if(threads_num == 0)
threads_num = system_cpu_thread_count();
@@ -155,12 +158,26 @@ public:
OSLShader::thread_init(kg);
#endif
for(int y = task.y; y < task.y + task.h; y++) {
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
#ifdef WITH_OPTIMIZED_KERNEL
if(system_cpu_support_optimized()) {
for(int y = task.y; y < task.y + task.h; y++) {
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
if(tasks.worker_cancel())
break;
if(tasks.worker_cancel())
break;
}
}
else
#endif
{
for(int y = task.y; y < task.y + task.h; y++) {
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
if(tasks.worker_cancel())
break;
}
}
#ifdef WITH_OSL
@@ -171,9 +188,18 @@ public:
void thread_tonemap(DeviceTask& task)
{
for(int y = task.y; y < task.y + task.h; y++) {
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
#ifdef WITH_OPTIMIZED_KERNEL
if(system_cpu_support_optimized()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
}
else
#endif
{
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
}
}
@@ -184,11 +210,24 @@ public:
OSLShader::thread_init(kg);
#endif
for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
#ifdef WITH_OPTIMIZED_KERNEL
if(system_cpu_support_optimized()) {
for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
if(tasks.worker_cancel())
break;
if(tasks.worker_cancel())
break;
}
}
else
#endif
{
for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
if(tasks.worker_cancel())
break;
}
}
#ifdef WITH_OSL

View File

@@ -8,6 +8,7 @@ set(INC
set(SRC
kernel.cpp
kernel_optimized.cpp
kernel.cl
kernel.cu
)
@@ -123,11 +124,15 @@ include_directories(${INC})
add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_SVM_HEADERS})
if(WITH_CYCLES_OPTIMIZED_KERNEL)
SET_SOURCE_FILES_PROPERTIES(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS ${CYCLES_OPTIMIZED_KERNEL_FLAGS})
endif()
if(WITH_CYCLES_CUDA)
add_dependencies(cycles_kernel cycles_kernel_cuda)
endif()
# OPENCL kernel
# OpenCL kernel
#set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
#add_custom_command(
@@ -142,3 +147,4 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)

View File

@@ -38,9 +38,14 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
void kernel_cpu_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
void kernel_cpu_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
#ifdef WITH_OPTIMIZED_KERNEL
void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
#endif
CCL_NAMESPACE_END
#endif /* __KERNEL_H__ */

View File

@@ -0,0 +1,60 @@
/*
* Copyright 2011, Blender Foundation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE3
optimization flags and nearly all functions inlined, while kernel.cpp
is compiled without for other CPU's. */
#ifdef WITH_OPTIMIZED_KERNEL
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_math.h"
#include "kernel_types.h"
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
#include "kernel_displace.h"
CCL_NAMESPACE_BEGIN
/* Path Tracing */
void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y)
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y);
}
/* Tonemapping */
void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y)
{
kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y);
}
/* Displacement */
void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i)
{
kernel_displace(kg, input, offset, i);
}
CCL_NAMESPACE_END
#endif

View File

@@ -118,5 +118,78 @@ int system_cpu_bits()
return (sizeof(void*)*8);
}
#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)
struct CPUCapabilities {
bool x64;
bool mmx;
bool sse;
bool sse2;
bool sse3;
bool ssse3;
bool sse41;
bool sse42;
bool sse4a;
bool avx;
bool xop;
bool fma3;
bool fma4;
};
bool system_cpu_support_optimized()
{
static CPUCapabilities caps;
static bool caps_init = false;
if(!caps_init) {
int result[4], num, num_ex;
memset(&caps, 0, sizeof(caps));
__cpuid(result, 0);
num = result[0];
__cpuid(result, 0x80000000);
num_ex = result[0];
if(num >= 1){
__cpuid(result, 0x00000001);
caps.mmx = (result[3] & ((int)1 << 23)) != 0;
caps.sse = (result[3] & ((int)1 << 25)) != 0;
caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
caps.avx = (result[2] & ((int)1 << 28)) != 0;
caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
}
/*if(num_ex >= 0x80000001){
__cpuid(result, 0x80000001);
caps.x64 = (result[3] & ((int)1 << 29)) != 0;
caps.sse4a = (result[2] & ((int)1 << 6)) != 0;
caps.fma4 = (result[2] & ((int)1 << 16)) != 0;
caps.xop = (result[2] & ((int)1 << 11)) != 0;
}*/
caps_init = true;
}
/* optimization flags use these */
return caps.sse && caps.sse2 && caps.sse3;
}
#else
bool system_cpu_support_optimized()
{
return false;
}
#endif
CCL_NAMESPACE_END

View File

@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
int system_cpu_thread_count();
string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_optimized();
CCL_NAMESPACE_END