Fix #29259: Cycles issues on certain processors. Two versions of the kernel are now
compiled, one SSE-optimized and one not, and the appropriate one is chosen at runtime.
This commit is contained in:
@@ -9,31 +9,18 @@ include(cmake/external_libs.cmake)
|
||||
# Build Flags
|
||||
|
||||
if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
|
||||
set(GCC_OPTIM_FLAGS "-ffast-math -msse -msse2 -msse3")
|
||||
endif()
|
||||
set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
|
||||
|
||||
if(APPLE)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
|
||||
set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
if(MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
|
||||
set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
|
||||
if(WIN32 AND MSVC)
|
||||
set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
|
||||
elseif(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
|
||||
set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
|
||||
# GCC flags applied to the optimized kernel source only (stray -DGOGOGO debug define removed).
set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(UNIX AND NOT APPLE)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
|
||||
set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
|
||||
endif()
|
||||
|
||||
# not needed yet, is for open shading language
|
||||
set(RTTI_DISABLE_FLAGS "")
|
||||
# for OSL, not needed yet
|
||||
# set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
|
||||
# set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
|
||||
|
||||
# Definitions and Includes
|
||||
|
||||
@@ -42,6 +29,10 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
|
||||
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
|
||||
add_definitions(-DCCL_NAMESPACE_END=})
|
||||
|
||||
if(WITH_CYCLES_OPTIMIZED_KERNEL)
|
||||
add_definitions(-DWITH_OPTIMIZED_KERNEL)
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_NETWORK)
|
||||
add_definitions(-DWITH_NETWORK)
|
||||
endif()
|
||||
|
@@ -10,11 +10,10 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
|
||||
|
||||
sources.remove(path.join('util', 'util_view.cpp'))
|
||||
sources.remove(path.join('render', 'film_response.cpp'))
|
||||
sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
|
||||
|
||||
incs = []
|
||||
defs = []
|
||||
ccflags = []
|
||||
cxxflags = []
|
||||
|
||||
defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
|
||||
defs.append('CCL_NAMESPACE_END=}')
|
||||
@@ -23,14 +22,6 @@ defs.append('WITH_OPENCL')
|
||||
defs.append('WITH_MULTI')
|
||||
defs.append('WITH_CUDA')
|
||||
|
||||
if env['OURPLATFORM'] in ('win32-mingw'):
|
||||
if env['WITH_BF_RAYOPTIMIZATION']:
|
||||
cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
|
||||
ccflags.append('-ffast-math -msse -msse2 -msse3'.split())
|
||||
# not needed yet, is for open shading language
|
||||
# cxxflags.append('-fno-rtti'.split())
|
||||
# defs.append('BOOST_NO_RTTI BOOST_NO_TYPEID'.split())
|
||||
|
||||
incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
|
||||
incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna'.split())
|
||||
incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
|
||||
@@ -39,5 +30,20 @@ incs.append(cycles['BF_OIIO_INC'])
|
||||
incs.append(cycles['BF_BOOST_INC'])
|
||||
incs.append(cycles['BF_PYTHON_INC'])
|
||||
|
||||
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None], cc_compileflags=ccflags, cxx_compileflags=cxxflags)
|
||||
# optimized kernel
|
||||
if env['WITH_BF_RAYOPTIMIZATION']:
|
||||
optim_cxxflags = []
|
||||
|
||||
if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
|
||||
optim_cxxflags.append('/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast'.split())
|
||||
else:
|
||||
optim_cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
|
||||
|
||||
optim_defs = defs + ['WITH_OPTIMIZED_KERNEL']
|
||||
optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
|
||||
|
||||
cycles_optim = cycles.Clone()
|
||||
cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[0], compileflags=[None], cxx_compileflags=optim_cxxflags)
|
||||
|
||||
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None])
|
||||
|
||||
|
@@ -48,6 +48,9 @@ public:
|
||||
{
|
||||
kg = kernel_globals_create();
|
||||
|
||||
/* do now to avoid thread issues */
|
||||
system_cpu_support_optimized();
|
||||
|
||||
if(threads_num == 0)
|
||||
threads_num = system_cpu_thread_count();
|
||||
|
||||
@@ -155,12 +158,26 @@ public:
|
||||
OSLShader::thread_init(kg);
|
||||
#endif
|
||||
|
||||
for(int y = task.y; y < task.y + task.h; y++) {
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
if(system_cpu_support_optimized()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++) {
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
|
||||
|
||||
if(tasks.worker_cancel())
|
||||
break;
|
||||
if(tasks.worker_cancel())
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
for(int y = task.y; y < task.y + task.h; y++) {
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
|
||||
|
||||
if(tasks.worker_cancel())
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WITH_OSL
|
||||
@@ -171,9 +188,18 @@ public:
|
||||
|
||||
void thread_tonemap(DeviceTask& task)
|
||||
{
|
||||
for(int y = task.y; y < task.y + task.h; y++) {
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
if(system_cpu_support_optimized()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -184,11 +210,24 @@ public:
|
||||
OSLShader::thread_init(kg);
|
||||
#endif
|
||||
|
||||
for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
|
||||
kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
if(system_cpu_support_optimized()) {
|
||||
for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
|
||||
kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
|
||||
|
||||
if(tasks.worker_cancel())
|
||||
break;
|
||||
if(tasks.worker_cancel())
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
|
||||
kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
|
||||
|
||||
if(tasks.worker_cancel())
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WITH_OSL
|
||||
|
@@ -8,6 +8,7 @@ set(INC
|
||||
|
||||
set(SRC
|
||||
kernel.cpp
|
||||
kernel_optimized.cpp
|
||||
kernel.cl
|
||||
kernel.cu
|
||||
)
|
||||
@@ -123,11 +124,15 @@ include_directories(${INC})
|
||||
|
||||
add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_SVM_HEADERS})
|
||||
|
||||
# Apply the optimized-kernel compiler flags to kernel_optimized.cpp only.
if(WITH_CYCLES_OPTIMIZED_KERNEL)
	# The variable is quoted so that COMPILE_FLAGS still receives a value
	# (an empty string) when no optimized flags were set for this compiler.
	set_source_files_properties(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}")
endif()
|
||||
|
||||
if(WITH_CYCLES_CUDA)
|
||||
add_dependencies(cycles_kernel cycles_kernel_cuda)
|
||||
endif()
|
||||
|
||||
# OPENCL kernel
|
||||
# OpenCL kernel
|
||||
|
||||
#set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
|
||||
#add_custom_command(
|
||||
@@ -142,3 +147,4 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
|
||||
|
||||
|
@@ -38,9 +38,14 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
|
||||
|
||||
void kernel_cpu_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
|
||||
void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
|
||||
|
||||
void kernel_cpu_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
|
||||
void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
|
||||
void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __KERNEL_H__ */
|
||||
|
60
intern/cycles/kernel/kernel_optimized.cpp
Normal file
60
intern/cycles/kernel/kernel_optimized.cpp
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright 2011, Blender Foundation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE3
   optimization flags and nearly all functions inlined, while kernel.cpp
   is compiled without those flags so that it runs on other CPUs. */
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
|
||||
#include "kernel.h"
|
||||
#include "kernel_compat_cpu.h"
|
||||
#include "kernel_math.h"
|
||||
#include "kernel_types.h"
|
||||
#include "kernel_globals.h"
|
||||
#include "kernel_film.h"
|
||||
#include "kernel_path.h"
|
||||
#include "kernel_displace.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Path Tracing */
|
||||
|
||||
/* Entry point of the optimized build for path tracing: hands all of its
 * arguments, unchanged, to kernel_path_trace(). */
void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state,
                                     int sample, int x, int y)
{
	kernel_path_trace(kg, buffer, rng_state, sample, x, y);
}
|
||||
|
||||
/* Tonemapping */
|
||||
|
||||
/* Entry point of the optimized build for tonemapping: hands all of its
 * arguments, unchanged, to kernel_film_tonemap(). */
void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer,
                                  int sample, int resolution, int x, int y)
{
	kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y);
}
|
||||
|
||||
/* Displacement */
|
||||
|
||||
/* Entry point of the optimized build for displacement: hands all of its
 * arguments, unchanged, to kernel_displace(). */
void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset,
                                   int i)
{
	kernel_displace(kg, input, offset, i);
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
@@ -118,5 +118,78 @@ int system_cpu_bits()
|
||||
return (sizeof(void*)*8);
|
||||
}
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)
|
||||
|
||||
/* Set of boolean x86 CPU feature flags. system_cpu_support_optimized()
 * zero-fills an instance and then sets flags from CPUID results. */
struct CPUCapabilities {
	bool x64;
	bool mmx;
	/* SSE family */
	bool sse, sse2, sse3, ssse3;
	bool sse41, sse42, sse4a;
	/* wider vector / fused multiply-add extensions */
	bool avx;
	bool xop;
	bool fma3, fma4;
};
|
||||
|
||||
bool system_cpu_support_optimized()
|
||||
{
|
||||
static CPUCapabilities caps;
|
||||
static bool caps_init = false;
|
||||
|
||||
if(!caps_init) {
|
||||
int result[4], num, num_ex;
|
||||
|
||||
memset(&caps, 0, sizeof(caps));
|
||||
|
||||
__cpuid(result, 0);
|
||||
num = result[0];
|
||||
|
||||
__cpuid(result, 0x80000000);
|
||||
num_ex = result[0];
|
||||
|
||||
if(num >= 1){
|
||||
__cpuid(result, 0x00000001);
|
||||
caps.mmx = (result[3] & ((int)1 << 23)) != 0;
|
||||
caps.sse = (result[3] & ((int)1 << 25)) != 0;
|
||||
caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
|
||||
caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
|
||||
|
||||
caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
|
||||
caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
|
||||
caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
|
||||
|
||||
caps.avx = (result[2] & ((int)1 << 28)) != 0;
|
||||
caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
|
||||
}
|
||||
|
||||
/*if(num_ex >= 0x80000001){
|
||||
__cpuid(result, 0x80000001);
|
||||
caps.x64 = (result[3] & ((int)1 << 29)) != 0;
|
||||
caps.sse4a = (result[2] & ((int)1 << 6)) != 0;
|
||||
caps.fma4 = (result[2] & ((int)1 << 16)) != 0;
|
||||
caps.xop = (result[2] & ((int)1 << 11)) != 0;
|
||||
}*/
|
||||
|
||||
caps_init = true;
|
||||
}
|
||||
|
||||
/* optimization flags use these */
|
||||
return caps.sse && caps.sse2 && caps.sse3;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Variant for targets outside the x86 preprocessor check: no feature
 * detection is done and the optimized kernel is always reported as
 * unsupported. */
bool system_cpu_support_optimized()
{
	const bool supported = false;
	return supported;
}
|
||||
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
|
||||
int system_cpu_thread_count();
|
||||
string system_cpu_brand_string();
|
||||
int system_cpu_bits();
|
||||
bool system_cpu_support_optimized();
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
Reference in New Issue
Block a user