Code cleanup: cycles
* Reshuffle SSE #ifdefs to try to avoid compilation errors when enabling SSE on 32-bit. * Remove CUDA kernel launch size exception on Mac, which is not needed. * Make OSL file compilation quiet like c/cpp files.
This commit is contained in:
@@ -607,13 +607,8 @@ public:
|
|||||||
cuda_assert(cuParamSetSize(cuPathTrace, offset))
|
cuda_assert(cuParamSetSize(cuPathTrace, offset))
|
||||||
|
|
||||||
/* launch kernel: todo find optimal size, cache config for fermi */
|
/* launch kernel: todo find optimal size, cache config for fermi */
|
||||||
#ifndef __APPLE__
|
|
||||||
int xthreads = 16;
|
int xthreads = 16;
|
||||||
int ythreads = 16;
|
int ythreads = 16;
|
||||||
#else
|
|
||||||
int xthreads = 8;
|
|
||||||
int ythreads = 8;
|
|
||||||
#endif
|
|
||||||
int xblocks = (rtile.w + xthreads - 1)/xthreads;
|
int xblocks = (rtile.w + xthreads - 1)/xthreads;
|
||||||
int yblocks = (rtile.h + ythreads - 1)/ythreads;
|
int yblocks = (rtile.h + ythreads - 1)/ythreads;
|
||||||
|
|
||||||
@@ -676,13 +671,8 @@ public:
|
|||||||
cuda_assert(cuParamSetSize(cuFilmConvert, offset))
|
cuda_assert(cuParamSetSize(cuFilmConvert, offset))
|
||||||
|
|
||||||
/* launch kernel: todo find optimal size, cache config for fermi */
|
/* launch kernel: todo find optimal size, cache config for fermi */
|
||||||
#ifndef __APPLE__
|
|
||||||
int xthreads = 16;
|
int xthreads = 16;
|
||||||
int ythreads = 16;
|
int ythreads = 16;
|
||||||
#else
|
|
||||||
int xthreads = 8;
|
|
||||||
int ythreads = 8;
|
|
||||||
#endif
|
|
||||||
int xblocks = (task.w + xthreads - 1)/xthreads;
|
int xblocks = (task.w + xthreads - 1)/xthreads;
|
||||||
int yblocks = (task.h + ythreads - 1)/ythreads;
|
int yblocks = (task.h + ythreads - 1)/ythreads;
|
||||||
|
|
||||||
@@ -730,11 +720,7 @@ public:
|
|||||||
cuda_assert(cuParamSetSize(cuDisplace, offset))
|
cuda_assert(cuParamSetSize(cuDisplace, offset))
|
||||||
|
|
||||||
/* launch kernel: todo find optimal size, cache config for fermi */
|
/* launch kernel: todo find optimal size, cache config for fermi */
|
||||||
#ifndef __APPLE__
|
|
||||||
int xthreads = 16;
|
int xthreads = 16;
|
||||||
#else
|
|
||||||
int xthreads = 8;
|
|
||||||
#endif
|
|
||||||
int xblocks = (task.shader_w + xthreads - 1)/xthreads;
|
int xblocks = (task.shader_w + xthreads - 1)/xthreads;
|
||||||
|
|
||||||
cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
|
cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
|
||||||
|
@@ -108,7 +108,6 @@
|
|||||||
|
|
||||||
/* data lookup defines */
|
/* data lookup defines */
|
||||||
#define kernel_data (*kg->data)
|
#define kernel_data (*kg->data)
|
||||||
#define kernel_tex_lookup(t, x, offset, size) kernel_tex_lookup_(kg->t, offset, size, x)
|
|
||||||
#define kernel_tex_fetch(t, index) kg->t[index]
|
#define kernel_tex_fetch(t, index) kg->t[index]
|
||||||
|
|
||||||
/* define NULL */
|
/* define NULL */
|
||||||
|
@@ -22,7 +22,7 @@
|
|||||||
|
|
||||||
#ifdef WITH_OPTIMIZED_KERNEL
|
#ifdef WITH_OPTIMIZED_KERNEL
|
||||||
|
|
||||||
//#define __KERNEL_SSE2__
|
#define __KERNEL_SSE2__
|
||||||
|
|
||||||
#include "kernel.h"
|
#include "kernel.h"
|
||||||
#include "kernel_compat_cpu.h"
|
#include "kernel_compat_cpu.h"
|
||||||
|
@@ -91,7 +91,7 @@ foreach(_file ${SRC_OSL})
|
|||||||
string(REPLACE ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} _OSO_FILE ${_OSO_FILE})
|
string(REPLACE ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} _OSO_FILE ${_OSO_FILE})
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${_OSO_FILE}
|
OUTPUT ${_OSO_FILE}
|
||||||
COMMAND ${OSL_COMPILER} -O2 -I"${CMAKE_CURRENT_SOURCE_DIR}" ${_OSL_FILE}
|
COMMAND ${OSL_COMPILER} -q -O2 -I"${CMAKE_CURRENT_SOURCE_DIR}" ${_OSL_FILE}
|
||||||
DEPENDS ${_OSL_FILE} ${SRC_OSL_HEADERS})
|
DEPENDS ${_OSL_FILE} ${SRC_OSL_HEADERS})
|
||||||
list(APPEND SRC_OSO
|
list(APPEND SRC_OSO
|
||||||
${_OSO_FILE}
|
${_OSO_FILE}
|
||||||
|
@@ -57,7 +57,7 @@ if env['WITH_BF_CYCLES_OSL']:
|
|||||||
osl_file = os.path.join(source_dir, f)
|
osl_file = os.path.join(source_dir, f)
|
||||||
oso_file = os.path.join(build_dir, f.replace('.osl', '.oso'))
|
oso_file = os.path.join(build_dir, f.replace('.osl', '.oso'))
|
||||||
|
|
||||||
command = "%s -O2 -I%s -o %s %s" % (osl_compiler, source_dir, oso_file, osl_file)
|
command = "%s -q -O2 -I%s -o %s %s" % (osl_compiler, source_dir, oso_file, osl_file)
|
||||||
|
|
||||||
shaders.Command(oso_file, f, command)
|
shaders.Command(oso_file, f, command)
|
||||||
shaders.Depends(oso_file, [f] + dependencies)
|
shaders.Depends(oso_file, [f] + dependencies)
|
||||||
|
@@ -61,59 +61,49 @@
|
|||||||
|
|
||||||
#ifndef __KERNEL_GPU__
|
#ifndef __KERNEL_GPU__
|
||||||
|
|
||||||
/* not enabled, globally applying it just gives slowdown,
|
/* not enabled, globally applying it gives slowdown, only for testing. */
|
||||||
* but useful for testing. */
|
#if 0
|
||||||
//#define __KERNEL_SSE__
|
#define __KERNEL_SSE__
|
||||||
#ifdef __KERNEL_SSE__
|
|
||||||
|
|
||||||
#include <xmmintrin.h> /* SSE 1 */
|
|
||||||
#include <emmintrin.h> /* SSE 2 */
|
|
||||||
#include <pmmintrin.h> /* SSE 3 */
|
|
||||||
#include <tmmintrin.h> /* SSSE 3 */
|
|
||||||
#include <smmintrin.h> /* SSE 4 */
|
|
||||||
|
|
||||||
#ifndef __KERNEL_SSE2__
|
#ifndef __KERNEL_SSE2__
|
||||||
#define __KERNEL_SSE2__
|
#define __KERNEL_SSE2__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __KERNEL_SSE3__
|
#ifndef __KERNEL_SSE3__
|
||||||
#define __KERNEL_SSE3__
|
#define __KERNEL_SSE3__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __KERNEL_SSSE3__
|
#ifndef __KERNEL_SSSE3__
|
||||||
#define __KERNEL_SSSE3__
|
#define __KERNEL_SSSE3__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __KERNEL_SSE4__
|
#ifndef __KERNEL_SSE4__
|
||||||
#define __KERNEL_SSE4__
|
#define __KERNEL_SSE4__
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
/* SSE2 is always available on x86_64 CPUs, so auto enable */
|
||||||
|
#if defined(__x86_64__) && !defined(__KERNEL_SSE2__)
|
||||||
|
#define __KERNEL_SSE2__
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__x86_64__) || defined(__KERNEL_SSSE3__)
|
/* SSE intrinsics headers */
|
||||||
|
#ifndef FREE_WINDOWS64
|
||||||
|
|
||||||
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
|
#ifdef __KERNEL_SSE2__
|
||||||
* Since we can't avoid including <windows.h>, better only include that */
|
|
||||||
#ifdef FREE_WINDOWS64
|
|
||||||
#include <windows.h>
|
|
||||||
#else
|
|
||||||
#include <xmmintrin.h> /* SSE 1 */
|
#include <xmmintrin.h> /* SSE 1 */
|
||||||
#include <emmintrin.h> /* SSE 2 */
|
#include <emmintrin.h> /* SSE 2 */
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __KERNEL_SSE3__
|
#ifdef __KERNEL_SSE3__
|
||||||
#include <pmmintrin.h> /* SSE 3 */
|
#include <pmmintrin.h> /* SSE 3 */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __KERNEL_SSSE3__
|
#ifdef __KERNEL_SSSE3__
|
||||||
#include <tmmintrin.h> /* SSSE 3 */
|
#include <tmmintrin.h> /* SSSE 3 */
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
|
|
||||||
/* SSE2 is available on x64 and SSE3 CPUs, so enable here as well */
|
#else
|
||||||
#ifndef __KERNEL_SSE2__
|
|
||||||
#define __KERNEL_SSE2__
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
|
||||||
|
* Since we can't avoid including <windows.h>, better only include that */
|
||||||
|
#include <windows.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -553,30 +543,6 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m12
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef __KERNEL_GPU__
|
|
||||||
|
|
||||||
static inline void *malloc_aligned(size_t size, size_t alignment)
|
|
||||||
{
|
|
||||||
void *data = (void*)malloc(size + sizeof(void*) + alignment - 1);
|
|
||||||
|
|
||||||
union { void *ptr; size_t offset; } u;
|
|
||||||
u.ptr = (char*)data + sizeof(void*);
|
|
||||||
u.offset = (u.offset + alignment - 1) & ~(alignment - 1);
|
|
||||||
*(((void**)u.ptr) - 1) = data;
|
|
||||||
|
|
||||||
return u.ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void free_aligned(void *ptr)
|
|
||||||
{
|
|
||||||
if(ptr) {
|
|
||||||
void *data = *(((void**)ptr) - 1);
|
|
||||||
free(data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
#endif /* __UTIL_TYPES_H__ */
|
#endif /* __UTIL_TYPES_H__ */
|
||||||
|
@@ -30,6 +30,26 @@ CCL_NAMESPACE_BEGIN
|
|||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
|
static inline void *malloc_aligned(size_t size, size_t alignment)
|
||||||
|
{
|
||||||
|
void *data = (void*)malloc(size + sizeof(void*) + alignment - 1);
|
||||||
|
|
||||||
|
union { void *ptr; size_t offset; } u;
|
||||||
|
u.ptr = (char*)data + sizeof(void*);
|
||||||
|
u.offset = (u.offset + alignment - 1) & ~(alignment - 1);
|
||||||
|
*(((void**)u.ptr) - 1) = data;
|
||||||
|
|
||||||
|
return u.ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void free_aligned(void *ptr)
|
||||||
|
{
|
||||||
|
if(ptr) {
|
||||||
|
void *data = *(((void**)ptr) - 1);
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Array
|
/* Array
|
||||||
*
|
*
|
||||||
* Simplified version of vector, serving multiple purposes:
|
* Simplified version of vector, serving multiple purposes:
|
||||||
|
Reference in New Issue
Block a user