From 45dcd20ca9e1f60c51e7752560b0042128740d69 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sat, 5 Aug 2017 04:06:39 +0200 Subject: [PATCH 01/11] Cycles: CUDA split performance tweaks, still far from megakernel. On Pabellon, 25.8s mega, 35.4s split before, 32.7s split after. --- intern/cycles/device/device_cuda.cpp | 10 +++------- intern/cycles/kernel/kernels/cuda/kernel_config.h | 9 +++++++-- intern/cycles/kernel/kernels/cuda/kernel_split.cu | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 3a29538aa13..dbf636e1405 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1898,17 +1898,13 @@ public: int threads_per_block; cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - - int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; - int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); cuda_assert(cuLaunchKernel(func, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ + xblocks, 1, 1, /* blocks */ + threads_per_block, 1, 1, /* threads */ 0, 0, args, 0)); device->cuda_pop_context(); diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 9fa39dc9ebb..7ae205b7e14 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,8 +81,13 @@ # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. 
*/ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ __launch_bounds__( \ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 628891b1458..e97e87285a5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ kernel_##name(NULL); \ @@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ ccl_local type locals; \ From f5f6f9c9ac3ec37ef98eae54464a801c2b7ddcc9 Mon Sep 17 00:00:00 2001 From: Bastien Montagne Date: Sun, 6 Aug 2017 17:35:41 +0200 Subject: [PATCH 02/11] Fix broken API doc generation: Partially revert rBa372638a76e0 Making those arrays static remove them from exported symbols, which breaks API doc generation script. To be backported to 2.79 branch. --- source/blender/editors/space_clip/space_clip.c | 3 ++- source/blender/editors/space_sequencer/space_sequencer.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/blender/editors/space_clip/space_clip.c b/source/blender/editors/space_clip/space_clip.c index 597b8be89b2..58930fa2cf2 100644 --- a/source/blender/editors/space_clip/space_clip.c +++ b/source/blender/editors/space_clip/space_clip.c @@ -821,7 +821,8 @@ static void clip_keymap(struct wmKeyConfig *keyconf) #endif } -static const char *clip_context_dir[] = {"edit_movieclip", "edit_mask", NULL}; +/* DO NOT make this static, this hides the symbol and breaks API generation script. */ +const char *clip_context_dir[] = {"edit_movieclip", "edit_mask", NULL}; static int clip_context(const bContext *C, const char *member, bContextDataResult *result) { diff --git a/source/blender/editors/space_sequencer/space_sequencer.c b/source/blender/editors/space_sequencer/space_sequencer.c index f965c1af54a..f1d0f23f8af 100644 --- a/source/blender/editors/space_sequencer/space_sequencer.c +++ b/source/blender/editors/space_sequencer/space_sequencer.c @@ -435,7 +435,8 @@ static void sequencer_dropboxes(void) /* ************* end drop *********** */ -static const char *sequencer_context_dir[] = {"edit_mask", NULL}; +/* DO NOT make this static, this hides the symbol and breaks API generation script. */ +const char *sequencer_context_dir[] = {"edit_mask", NULL}; static int sequencer_context(const bContext *C, const char *member, bContextDataResult *result) { From 2a74f36dacb5a6294e2e8d602e055cf9cd9fa702 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sat, 5 Aug 2017 15:46:40 +0200 Subject: [PATCH 03/11] Fix Cycles CUDA adaptive megakernel build error. 
--- intern/cycles/kernel/kernel_path.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index c340b3bc968..8f6c2b07381 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -100,6 +100,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, #ifndef __SPLIT_KERNEL__ +#if defined(__BRANCHED_PATH__) || defined(__BAKING__) + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, @@ -428,6 +430,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } } +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, RNG *rng, From d1752167a9fd86403b7c197fff933ebf718065bd Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 7 Aug 2017 21:17:14 +1000 Subject: [PATCH 04/11] Fix T52278: 'Default' application template fails Own error in 7398b3b7 --- source/blender/windowmanager/intern/wm_files.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/blender/windowmanager/intern/wm_files.c b/source/blender/windowmanager/intern/wm_files.c index b4a6366fb4e..605a226500f 100644 --- a/source/blender/windowmanager/intern/wm_files.c +++ b/source/blender/windowmanager/intern/wm_files.c @@ -721,20 +721,20 @@ int wm_homefile_read( if (filepath_startup_override != NULL) { /* pass */ } - else if (app_template_override && app_template_override[0]) { + else if (app_template_override) { + /* This may be clearing the current template by setting to an empty string. */ app_template = app_template_override; } else if (!use_factory_settings && U.app_template[0]) { app_template = U.app_template; } - if (app_template != NULL) { + if ((app_template != NULL) && (app_template[0] != '\0')) { BKE_appdir_app_template_id_search(app_template, app_template_system, sizeof(app_template_system)); BLI_path_join(app_template_config, sizeof(app_template_config), cfgdir, app_template, NULL); - } - /* insert template name into startup file */ - if (app_template != NULL) { + /* Insert template name into startup file. */ + /* note that the path is being set even when 'use_factory_settings == true' * this is done so we can load a templates factory-settings */ if (!use_factory_settings) { From 5e4bad2c00e466bfc013f8ad9edac7ad66938001 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 2 Aug 2017 15:23:50 +0200 Subject: [PATCH 05/11] Cycles: remove option to disable transparent shadows globally. We already detect this automatically based on shading nodes and per shader settings, and performance of this option is ok now all devices. 
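For reference, the automatic detection mentioned above is the per-shader
scan kept in intern/cycles/render/integrator.cpp (see the hunk below).
A minimal standalone sketch, where the std::vector scaffolding and the
function name are only for illustration and the Shader flags are the
real ones:

#include <vector>

struct Shader {
	bool has_surface_transparent;
	bool use_transparent_shadow;
	bool has_volume;
};

static bool scene_needs_transparent_shadows(const std::vector<Shader*>& shaders)
{
	for(const Shader *shader : shaders) {
		/* Keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp. */
		if((shader->has_surface_transparent && shader->use_transparent_shadow) ||
		   shader->has_volume)
		{
			return true;
		}
	}
	return false;
}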
Differential Revision: https://developer.blender.org/D2767 --- intern/cycles/blender/addon/presets.py | 1 - intern/cycles/blender/addon/properties.py | 5 ----- intern/cycles/blender/addon/ui.py | 1 - intern/cycles/blender/blender_sync.cpp | 1 - intern/cycles/render/integrator.cpp | 18 ++++++------------ intern/cycles/render/integrator.h | 1 - intern/cycles/render/session.cpp | 1 - intern/cycles/render/shader.cpp | 4 +--- 8 files changed, 7 insertions(+), 25 deletions(-) diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index 440221b8470..17efb00abdb 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -37,7 +37,6 @@ class AddPresetIntegrator(AddPresetBase, Operator): "cycles.transmission_bounces", "cycles.volume_bounces", "cycles.transparent_max_bounces", - "cycles.use_transparent_shadows", "cycles.caustics_reflective", "cycles.caustics_refractive", "cycles.blur_glossy" diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index cb5b13f8d3a..c744c1d6932 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -349,11 +349,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): min=0, max=1024, default=8, ) - cls.use_transparent_shadows = BoolProperty( - name="Transparent Shadows", - description="Use transparency of surfaces for rendering shadows", - default=True, - ) cls.volume_step_size = FloatProperty( name="Step Size", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 45a77aa8c38..b9565aa4c7f 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -292,7 +292,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.label("Transparency:") sub.prop(cscene, "transparent_max_bounces", text="Max") - sub.prop(cscene, "use_transparent_shadows", text="Shadows") col.separator() diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index ddaa962f63d..e953c685b56 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -242,7 +242,6 @@ void BlenderSync::sync_integrator() integrator->max_volume_bounce = get_int(cscene, "volume_bounces"); integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces"); - integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows"); integrator->volume_max_steps = get_int(cscene, "volume_max_steps"); integrator->volume_step_size = get_float(cscene, "volume_step_size"); diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index b9b8c681a26..15b728d6e02 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -39,7 +39,6 @@ NODE_DEFINE(Integrator) SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7); SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); - SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false); SOCKET_INT(ao_bounces, "AO Bounces", 0); @@ -121,19 +120,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene * We only need to enable transparent shadows, if we actually have * transparent shaders in the scene. Otherwise we can disable it * to improve performance a bit. 
*/ - if(transparent_shadows) { - kintegrator->transparent_shadows = false; - foreach(Shader *shader, scene->shaders) { - /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ - if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { - kintegrator->transparent_shadows = true; - break; - } + kintegrator->transparent_shadows = false; + foreach(Shader *shader, scene->shaders) { + /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ + if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { + kintegrator->transparent_shadows = true; + break; } } - else { - kintegrator->transparent_shadows = false; - } kintegrator->volume_max_steps = volume_max_steps; kintegrator->volume_step_size = volume_step_size; diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index ce5651ec823..3cb430d72b4 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -39,7 +39,6 @@ public: int max_volume_bounce; int transparent_max_bounce; - bool transparent_shadows; int ao_bounces; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 8622318858e..ca3aefcb5e6 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -721,7 +721,6 @@ DeviceRequestedFeatures Session::get_requested_device_features() BakeManager *bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - requested_features.use_transparent &= scene->integrator->transparent_shadows; requested_features.use_denoising = params.use_denoising; return requested_features; diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 44a266dfe18..493e01de363 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -503,9 +503,7 @@ void ShaderManager::device_update_common(Device *device, KernelIntegrator *kintegrator = &dscene->data.integrator; kintegrator->use_volumes = has_volumes; /* TODO(sergey): De-duplicate with flags set in integrator.cpp. */ - if(scene->integrator->transparent_shadows) { - kintegrator->transparent_shadows = has_transparent_shadow; - } + kintegrator->transparent_shadows = has_transparent_shadow; } void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene) From a8cc0d707e82ac781f44bf6cd7ed1e8974d8ed39 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 2 Aug 2017 02:09:08 +0200 Subject: [PATCH 06/11] Code refactor: split defines into separate header, changes to SSE type headers. I need to use some macros defined in util_simd.h for float3/float4, to emulate SSE4 instructions on SSE2. But due to issues with order of header includes this was not possible, this does some refactoring to make it work. 
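The kind of emulation meant here is what util_simd.h already provides
when SSE4.1 is not available, for example its _mm_blendv_ps fallback.
A standalone sketch of that pattern (the function name is made up for
illustration, the body matches the existing fallback):

#include <emmintrin.h>  /* SSE2 */

/* Take lanes of `input` where the mask bits are set, `value` elsewhere. */
static inline __m128 blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
	return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value));
}

With util_defines.h split out, util_simd.h only depends on those defines
instead of util_types.h, so util_types.h can include it before the
float3/float4 headers and pull in the SSE type headers at the end.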
Differential Revision: https://developer.blender.org/D2764 --- intern/cycles/device/device_cpu.cpp | 1 + intern/cycles/kernel/CMakeLists.txt | 1 + intern/cycles/util/CMakeLists.txt | 1 + intern/cycles/util/util_defines.h | 134 ++++++++++++++++++++++++ intern/cycles/util/util_optimization.h | 52 ---------- intern/cycles/util/util_simd.h | 45 ++++++-- intern/cycles/util/util_sseb.h | 3 + intern/cycles/util/util_ssef.h | 3 + intern/cycles/util/util_ssei.h | 9 +- intern/cycles/util/util_types.h | 136 +++---------------------- 10 files changed, 201 insertions(+), 184 deletions(-) create mode 100644 intern/cycles/util/util_defines.h diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 18112437b45..a00be3eeaab 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -48,6 +48,7 @@ #include "util/util_logging.h" #include "util/util_map.h" #include "util/util_opengl.h" +#include "util/util_optimization.h" #include "util/util_progress.h" #include "util/util_system.h" #include "util/util_thread.h" diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 23e9bd311c4..88c4c4e3282 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -233,6 +233,7 @@ set(SRC_FILTER_HEADERS set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h + ../util/util_defines.h ../util/util_half.h ../util/util_hash.h ../util/util_math.h diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 43f9a57d099..7f3747a0f58 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -38,6 +38,7 @@ set(SRC_HEADERS util_atomic.h util_boundbox.h util_debug.h + util_defines.h util_guarded_allocator.cpp util_foreach.h util_function.h diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h new file mode 100644 index 00000000000..d0d87e74332 --- /dev/null +++ b/intern/cycles/util/util_defines.h @@ -0,0 +1,134 @@ + +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ + +/* Bitness */ + +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif + +/* Qualifiers for kernel code shared by CPU and GPU */ + +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_constant +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) 
__declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ + +/* macros */ + +/* hints for branch prediction, only use in code that runs a _lot_ */ +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) +# define HAS_CPP11_FEATURES +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(HAS_CPP11_FEATURES) +/* Some magic to be sure we don't have reference in the type. */ +template static inline T decltype_helper(T x) { return x; } +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) +# endif +#endif + +/* Causes warning: + * incompatible types when assigning to type 'Foo' from type 'Bar' + * ... the compiler optimizes away the temp var */ +#ifdef __GNUC__ +#define CHECK_TYPE(var, type) { \ + TYPEOF(var) *__tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ +} (void)0 + +#define CHECK_TYPE_PAIR(var_a, var_b) { \ + TYPEOF(var_a) *__tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ +} (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif + +/* can be used in simple macros */ +#define CHECK_TYPE_INLINE(val, type) \ + ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +#endif /* __UTIL_DEFINES_H__ */ + diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 6f70a474fe7..0382c0811dd 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -19,16 +19,6 @@ #ifndef __KERNEL_GPU__ -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) || \ - defined(__KERNEL_SSE3__) || \ - defined(__KERNEL_SSSE3__) || \ - defined(__KERNEL_SSE41__) || \ - defined(__KERNEL_AVX__) || \ - defined(__KERNEL_AVX2__) - /* do nothing */ -#endif - /* x86 * * Compile a regular, SSE2 and SSE3 kernel. 
*/ @@ -73,48 +63,6 @@ #endif /* defined(__x86_64__) || defined(_M_X64) */ -/* SSE Experiment - * - * This is disabled code for an experiment to use SSE types globally for types - * such as float3 and float4. Currently this gives an overall slowdown. */ - -#if 0 -# define __KERNEL_SSE__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# ifndef __KERNEL_SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifndef __KERNEL_SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifndef __KERNEL_SSE4__ -# define __KERNEL_SSE4__ -# endif -#endif - -/* SSE Intrinsics includes - * - * We assume __KERNEL_SSEX__ flags to have been defined at this point */ - -/* SSE intrinsics headers */ -#ifndef FREE_WINDOWS64 - -#ifdef _MSC_VER -# include -#elif (defined(__x86_64__) || defined(__i386__)) -# include -#endif - -#else - -/* MinGW64 has conflicting declarations for these SSE headers in . - * Since we can't avoid including , better only include that */ -#include "util/util_windows.h" - -#endif - #endif #endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 587febe3e52..7d938a0fbca 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -18,19 +18,38 @@ #ifndef __UTIL_SIMD_TYPES_H__ #define __UTIL_SIMD_TYPES_H__ +#ifndef __KERNEL_GPU__ + #include #include "util/util_debug.h" -#include "util/util_types.h" +#include "util/util_defines.h" + +/* SSE Intrinsics includes + * + * We assume __KERNEL_SSEX__ flags to have been defined at this point */ + +/* SSE intrinsics headers */ +#ifndef FREE_WINDOWS64 + +#ifdef _MSC_VER +# include +#elif (defined(__x86_64__) || defined(__i386__)) +# include +#endif + +#else + +/* MinGW64 has conflicting declarations for these SSE headers in . + * Since we can't avoid including , better only include that */ +#include "util/util_windows.h" + +#endif CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ -struct sseb; -struct ssei; -struct ssef; - extern const __m128 _mm_lookupmask_ps[16]; /* Special Types */ @@ -496,13 +515,19 @@ ccl_device_inline int bitscan(int value) #endif /* __KERNEL_SSE2__ */ +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) || \ + defined(__KERNEL_SSE3__) || \ + defined(__KERNEL_SSSE3__) || \ + defined(__KERNEL_SSE41__) || \ + defined(__KERNEL_AVX__) || \ + defined(__KERNEL_AVX2__) + /* do nothing */ +#endif + CCL_NAMESPACE_END -#include "util/util_math.h" -#include "util/util_sseb.h" -#include "util/util_ssei.h" -#include "util/util_ssef.h" -#include "util/util_avxf.h" +#endif /* __KERNEL_GPU__ */ #endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index 6e669701f3b..93c22aafdcd 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct ssei; +struct ssef; + /*! 4-wide SSE bool type. */ struct sseb { diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index cf99a08efae..bb007ff84a9 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE float type. */ struct ssef { diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h index 5f62569268c..ef2a9e68b7d 100644 --- a/intern/cycles/util/util_ssei.h +++ b/intern/cycles/util/util_ssei.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 
4-wide SSE integer type. */ struct ssei { @@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a #else -__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); } -__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); } +__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; } +__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; } +__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); } +__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); } __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; } #endif diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a5d1d7152d5..d9642df8005 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -21,72 +21,17 @@ # include #endif -/* Bitness */ - -#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -# define __KERNEL_64_BIT__ -#endif - -/* Qualifiers for kernel code shared by CPU and GPU */ - -#ifndef __KERNEL_GPU__ -# define ccl_device static inline -# define ccl_device_noinline static -# define ccl_global -# define ccl_constant -# define ccl_local -# define ccl_local_param -# define ccl_private -# define ccl_restrict __restrict -# define __KERNEL_WITH_SSE_ALIGN__ - -# if defined(_WIN32) && !defined(FREE_WINDOWS) -# define ccl_device_inline static __forceinline -# define ccl_device_forceinline static __forceinline -# define ccl_align(...) __declspec(align(__VA_ARGS__)) -# ifdef __KERNEL_64_BIT__ -# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -# else /* __KERNEL_64_BIT__ */ -# undef __KERNEL_WITH_SSE_ALIGN__ -/* No support for function arguments (error C2719). */ -# define ccl_try_align(...) -# endif /* __KERNEL_64_BIT__ */ -# define ccl_may_alias -# define ccl_always_inline __forceinline -# define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused -# else /* _WIN32 && !FREE_WINDOWS */ -# define ccl_device_inline static inline __attribute__((always_inline)) -# define ccl_device_forceinline static inline __attribute__((always_inline)) -# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) -# ifndef FREE_WINDOWS64 -# define __forceinline inline __attribute__((always_inline)) -# endif -# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -# define ccl_may_alias __attribute__((__may_alias__)) -# define ccl_always_inline __attribute__((always_inline)) -# define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) -# endif /* _WIN32 && !FREE_WINDOWS */ - -/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ -# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ -# define ATTR_FALLTHROUGH __attribute__((fallthrough)) -# else -# define ATTR_FALLTHROUGH ((void)0) -# endif -#endif /* __KERNEL_GPU__ */ - /* Standard Integer Types */ +#if !defined(__KERNEL_GPU__) && !defined(_WIN32) +# include +#endif + +#include "util/util_defines.h" + #ifndef __KERNEL_GPU__ -/* int8_t, uint16_t, and friends */ -# ifndef _WIN32 -# include -# endif -/* SIMD Types */ -# include "util/util_optimization.h" -#endif /* __KERNEL_GPU__ */ +# include "util/util_simd.h" +#endif CCL_NAMESPACE_BEGIN @@ -201,65 +146,8 @@ enum ExtensionType { EXTENSION_NUM_TYPES, }; -/* macros */ - -/* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -#else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -#endif - -#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) -# define HAS_CPP11_FEATURES -#endif - -#if defined(__GNUC__) || defined(__clang__) -# if defined(HAS_CPP11_FEATURES) -/* Some magic to be sure we don't have reference in the type. */ -template static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif - -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ... the compiler optimizes away the temp var */ -#ifdef __GNUC__ -#define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ -} (void)0 - -#define CHECK_TYPE_PAIR(var_a, var_b) { \ - TYPEOF(var_a) *__tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ -} (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -#endif - -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) \ - ((void)(((type)0) != (val))) - - CCL_NAMESPACE_END -#ifndef __KERNEL_GPU__ -# include -# define util_assert(statement) assert(statement) -#else -# define util_assert(statement) -#endif - /* Vectorized types declaration. */ #include "util/util_types_uchar2.h" #include "util/util_types_uchar3.h" @@ -298,5 +186,13 @@ CCL_NAMESPACE_END #include "util/util_types_vector3_impl.h" +/* SSE types. */ +#ifndef __KERNEL_GPU__ +# include "util/util_sseb.h" +# include "util/util_ssei.h" +# include "util/util_ssef.h" +# include "util/util_avxf.h" +#endif + #endif /* __UTIL_TYPES_H__ */ From a24fbf3323101cd35332161b12a04e687b5583e4 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 2 Aug 2017 02:23:03 +0200 Subject: [PATCH 07/11] Code refactor: add, remove, optimize various SSE functions. * Remove some unnecessary SSE emulation defines. * Use full precision float division so we can enable it. * Add sqrt(), sqr(), fabs(), shuffle variations, mask(). * Optimize reduce_add(), select(). 
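As an illustration of the reduce_add() change: with SSE3 the sum of the
four lanes is now computed with two horizontal adds instead of two
shuffle+add rounds. Sketched on a raw __m128 (the patch itself operates
on the float4 wrapper and returns the sum broadcast to all lanes; the
function name here is only for illustration):

#include <pmmintrin.h>  /* SSE3 */

static inline float reduce_add_sse3(__m128 a)
{
	__m128 h = _mm_hadd_ps(a, a);            /* (x+y, z+w, x+y, z+w) */
	return _mm_cvtss_f32(_mm_hadd_ps(h, h)); /* x+y+z+w */
}

select() similarly now reinterprets the integer mask with
_mm_castsi128_ps and blends with _mm_blendv_ps (emulated on SSE2)
instead of converting the mask to float first.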
Differential Revision: https://developer.blender.org/D2764 --- .../kernel/filter/filter_features_sse.h | 12 +-- intern/cycles/kernel/geom/geom_object.h | 5 - intern/cycles/util/util_math.h | 1 + intern/cycles/util/util_math_float3.h | 20 ++-- intern/cycles/util/util_math_float4.h | 93 +++++++++++++++---- intern/cycles/util/util_simd.h | 52 +---------- 6 files changed, 92 insertions(+), 91 deletions(-) diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3185330994c..27e220923a0 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -92,13 +92,13 @@ ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) { - scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); - scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + scale[0] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[7] = scale[8] = scale[9] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 6ecdfe0173a..1ffc143be34 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 return rcp(dir); -#else - return 1.0f / dir; -#endif } /* Transform ray into object space to enter static object in BVH */ diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index b719640b19c..4d51ec5570a 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b) #ifndef __KERNEL_GPU__ using std::isfinite; using std::isnan; +using std::sqrt; ccl_device_inline int abs(int x) { diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index bb04c4aa2d9..e73e5bc17a2 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f) ccl_device_inline float3 operator*(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 +#if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(a.x*f, a.y*f, a.z*f); @@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a) ccl_device_inline float3 operator/(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(f / a.x, f / a.y, f / a.z); #endif @@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f) ccl_device_inline float3 operator/(const float3& a, const float3& b) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, b.m128)); #else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); #endif @@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t) ccl_device_inline float3 rcp(const float3& a) { #ifdef __KERNEL_SSE__ - const float4 r(_mm_rcp_ps(a.m128)); - return float3(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. */ + return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); #endif diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index d89121b3a1d..007b3fc5082 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -48,6 +48,8 @@ ccl_device_inline bool operator==(const float4& a, const float4& b); ccl_device_inline float dot(const float4& a, const float4& b); ccl_device_inline float len_squared(const float4& a); ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 sqrt(const float4& a); +ccl_device_inline float4 sqr(const float4& a); ccl_device_inline float4 cross(const float4& a, const float4& b); ccl_device_inline bool is_zero(const float4& a); ccl_device_inline float reduce_add(const float4& a); @@ -57,14 +59,20 @@ ccl_device_inline float4 normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 fabs(const float4& a); #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ template __forceinline const float4 shuffle(const float4& b); +template +__forceinline const float4 shuffle(const float4& a, const float4& b); template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b); +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b); + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b); @@ -77,9 +85,6 @@ ccl_device_inline float4 select(const int4& mask, const float4& b); ccl_device_inline float4 reduce_min(const float4& a); ccl_device_inline float4 reduce_max(const float4& a); -# if 0 -ccl_device_inline float4 reduce_add(const float4& a); -# endif 
#endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f) ccl_device_inline float4 operator/(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return a * rcp(b); + return float4(_mm_div_ps(a.m128, b.m128)); #else return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); #endif @@ -224,14 +229,30 @@ ccl_device_inline float len_squared(const float4& a) ccl_device_inline float4 rcp(const float4& a) { #ifdef __KERNEL_SSE__ - float4 r(_mm_rcp_ps(a.m128)); - return float4(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. */ + return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); #endif } +ccl_device_inline float4 sqrt(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sqrt_ps(a.m128)); +#else + return make_float4(sqrtf(a.x), + sqrtf(a.y), + sqrtf(a.z), + sqrtf(a.w)); +#endif +} + +ccl_device_inline float4 sqr(const float4& a) +{ + return a * a; +} + ccl_device_inline float4 cross(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ @@ -257,9 +278,13 @@ ccl_device_inline bool is_zero(const float4& a) ccl_device_inline float reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE3__ + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128)); +# else float4 h(shuffle<1,0,3,2>(a) + a); - /* TODO(sergey): Investigate efficiency. */ - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); + return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); +# endif #else return ((a.x + a.y) + (a.z + a.w)); #endif @@ -309,6 +334,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b) max(a.w, b.w)); #endif } + +ccl_device_inline float4 fabs(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +#else + return make_float4(fabsf(a.x), + fabsf(a.y), + fabsf(a.z), + fabsf(a.w)); +#endif +} #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ @@ -320,11 +357,28 @@ __forceinline const float4 shuffle(const float4& b) _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); } +template +__forceinline const float4 shuffle(const float4& a, const float4& b) +{ + return float4(_mm_shuffle_ps(a.m128, b.m128, + _MM_SHUFFLE(index_3, index_2, index_1, index_0))); +} + template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) { return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); } +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b) +{ + return float4(_mm_movelh_ps(a.m128, b.m128)); +} + +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b) +{ + return float4(_mm_movehl_ps(b.m128, a.m128)); +} + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { @@ -344,9 +398,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), - _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); + return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128))); #else return make_float4((mask.x)? a.x: b.x, (mask.y)? 
a.y: b.y, @@ -355,6 +407,13 @@ ccl_device_inline float4 select(const int4& mask, #endif } +ccl_device_inline float4 mask(const int4& mask, + const float4& a) +{ + /* Replace elements of x with zero where mask isn't set. */ + return select(mask, a, make_float4(0.0f)); +} + ccl_device_inline float4 reduce_min(const float4& a) { #ifdef __KERNEL_SSE__ @@ -375,17 +434,15 @@ ccl_device_inline float4 reduce_max(const float4& a) #endif } -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) +ccl_device_inline float4 load_float4(const float *v) { #ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; + return float4(_mm_loadu_ps(v)); #else - return make_float4((a.x + a.y) + (a.z + a.w)); + return make_float4(v[0], v[1], v[2], v[3]); #endif } -#endif + #endif /* !__KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 7d938a0fbca..66dd80420ae 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -347,12 +347,9 @@ __forceinline size_t __bscf(size_t& v) #endif /* _WIN32 */ -static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; -static const size_t BITSCAN_NO_BIT_SET_64 = 64; +#if !(defined(__SSE4_1__) || defined(__SSE4_2__)) -#ifdef __KERNEL_SSE3__ -/* Emulation of SSE4 functions with SSE3 */ -# ifndef __KERNEL_SSE41__ +/* Emulation of SSE4 functions with SSE2 */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -361,48 +358,31 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_CUR_DIRECTION 0x04 #undef _mm_blendv_ps -#define _mm_blendv_ps __emu_mm_blendv_ps __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); } #undef _mm_blend_ps -#define _mm_blend_ps __emu_mm_blend_ps __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } #undef _mm_blendv_epi8 -#define _mm_blendv_epi8 __emu_mm_blendv_epi8 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } -#undef _mm_mullo_epi32 -#define _mm_mullo_epi32 __emu_mm_mullo_epi32 -__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { - __m128i rvalue; - char* _r = (char*)(&rvalue + 1); - char* _v = (char*)(& value + 1); - char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); - return rvalue; -} - #undef _mm_min_epi32 -#define _mm_min_epi32 __emu_mm_min_epi32 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } #undef _mm_max_epi32 -#define _mm_max_epi32 __emu_mm_max_epi32 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } #undef _mm_extract_epi32 -#define _mm_extract_epi32 __emu_mm_extract_epi32 __forceinline int _mm_extract_epi32( __m128i input, const int index ) { switch ( index ) { case 0: return _mm_cvtsi128_si32(input); @@ -414,24 +394,15 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } #undef _mm_insert_epi32 -#define _mm_insert_epi32 __emu_mm_insert_epi32 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { assert(index 
>= 0 && index < 4); ((int*)&value)[index] = input; return value; } -#undef _mm_extract_ps -#define _mm_extract_ps __emu_mm_extract_ps -__forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32_t* ptr = (int32_t*)&input; return ptr[index]; -} - #undef _mm_insert_ps -#define _mm_insert_ps __emu_mm_insert_ps __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } #undef _mm_round_ps -#define _mm_round_ps __emu_mm_round_ps __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) { switch ( flags ) @@ -444,22 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -# ifdef _M_X64 -#undef _mm_insert_epi64 -#define _mm_insert_epi64 __emu_mm_insert_epi64 -__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { - assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; -} - -#undef _mm_extract_epi64 -#define _mm_extract_epi64 __emu_mm_extract_epi64 -__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { - assert(size_t(index) < 2); - return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); -} -# endif - -# endif +#endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */ #undef _mm_fabs_ps #define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) @@ -494,8 +450,6 @@ ccl_device_inline __m128 _mm_hsum_ps(__m128 x) #undef _mm_mask_ps #define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) -#endif - #else /* __KERNEL_SSE2__ */ /* This section is for utility functions which operates on non-register data From ee77c1e917dd5e9d8eaca6212376a11aef1a877d Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 19 Jul 2017 01:54:56 +0200 Subject: [PATCH 08/11] Code refactor: use float4 instead of intrinsics for CPU denoise filtering. Differential Revision: https://developer.blender.org/D2764 --- .../kernel/filter/filter_features_sse.h | 80 ++++++++----------- intern/cycles/kernel/filter/filter_nlm_cpu.h | 18 ++--- .../kernel/filter/filter_transform_sse.h | 16 ++-- .../cycles/kernel/kernels/cpu/filter_sse2.cpp | 1 + .../cycles/kernel/kernels/cpu/filter_sse3.cpp | 1 + .../kernel/kernels/cpu/filter_sse41.cpp | 1 + intern/cycles/util/util_math_float4.h | 12 +-- intern/cycles/util/util_math_matrix.h | 28 +++---- intern/cycles/util/util_simd.h | 33 -------- 9 files changed, 71 insertions(+), 119 deletions(-) diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 27e220923a0..3ddd8712266 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. 
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - __m128 y4 = _mm_set1_ps(pixel.y); \ + float4 y4 = make_float4(pixel.y); \ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ - __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); #define END_FOR_PIXEL_WINDOW_SSE } \ pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *features, - const __m128 *ccl_restrict mean, + float4 *features, + const float4 *ccl_restrict mean, int pass_stride) { features[0] = x; features[1] = y; - features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[2] = fabs(ccl_get_feature_sse(0)); features[3] = ccl_get_feature_sse(1); features[4] = ccl_get_feature_sse(2); features[5] = ccl_get_feature_sse(3); @@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, features[9] = ccl_get_feature_sse(7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_sub_ps(features[i], mean[i]); + features[i] = features[i] - mean[i]; } for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_mask_ps(features[i], active_pixels); + features[i] = mask(active_pixels, features[i]); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *scales, - const __m128 *ccl_restrict mean, + float4 *scales, + const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); - scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); - - scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); - - __m128 diff, scale; - diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[3] = _mm_mask_ps(scale, active_pixels); - - scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); - - diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[5] = _mm_mask_ps(scale, active_pixels); + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + 
sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) { - scale[0] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); - scale[2] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - - scale[7] = scale[8] = scale[9] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 3e752bce68f..5e989331bc2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int w, int f) { -#ifdef __KERNEL_SSE3__ - int aligned_lowx = (rect.x & ~(3)); - int aligned_highx = ((rect.z + 3) & ~(3)); -#endif + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); @@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { -#ifdef __KERNEL_SSE3__ - for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; } -#else - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] += difference_image[y1*w+x]; - } -#endif } for(int x = rect.x; x < rect.z; x++) { out_image[y*w+x] *= 1.0f/(high - low); diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 30dc2969b11..9e65f61664b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff { int buffer_w = align_up(rect.z - rect.x, 4); - __m128 features[DENOISE_FEATURES]; + float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; int2 pixel; @@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff min(rect.w, y + radius + 1)); int num_pixels = (high.y - low.y) * (high.x - 
low.x); - __m128 feature_means[DENOISE_FEATURES]; + float4 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + float4 pixel_scale = make_float4(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { - feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } - __m128 feature_scale[DENOISE_FEATURES]; + float4 feature_scale[DENOISE_FEATURES]; math_vector_zero_sse(feature_scale, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); @@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale); - __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; @@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. 
*/ for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp index f7c9935f1d0..a13fb5cd4fb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp index 070b95a3505..6b690adf0f5 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 1a7b2040da1..254025be4e2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index 007b3fc5082..adb9a76a434 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -52,7 +52,6 @@ ccl_device_inline float4 sqrt(const float4& a); ccl_device_inline float4 sqr(const float4& a); ccl_device_inline float4 cross(const float4& a, const float4& b); ccl_device_inline bool is_zero(const float4& a); -ccl_device_inline float reduce_add(const float4& a); ccl_device_inline float average(const float4& a); ccl_device_inline float len(const float4& a); ccl_device_inline float4 normalize(const float4& a); @@ -85,6 +84,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b); ccl_device_inline float4 reduce_min(const float4& a); ccl_device_inline float4 reduce_max(const float4& a); +ccl_device_inline float4 reduce_add(const float4& a); #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -275,24 +275,24 @@ ccl_device_inline bool is_zero(const float4& a) #endif } -ccl_device_inline float reduce_add(const float4& a) +ccl_device_inline float4 reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ # ifdef __KERNEL_SSE3__ float4 h(_mm_hadd_ps(a.m128, a.m128)); - return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128)); + return float4( _mm_hadd_ps(h.m128, h.m128)); # else float4 h(shuffle<1,0,3,2>(a) + a); - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); + return shuffle<2,3,0,1>(h) + h; # endif #else - return ((a.x + a.y) + (a.z + a.w)); + return make_float4(((a.x + a.y) + (a.z + a.w))); #endif } ccl_device_inline float average(const float4& a) { - return reduce_add(a) * 0.25f; + return reduce_add(a)[0] * 0.25f; } ccl_device_inline float len(const 
float4& a) diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index c7511f8306e..7269d391956 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } #ifdef __KERNEL_SSE3__ -ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +ccl_device_inline void math_vector_zero_sse(float4 *A, int n) { for(int i = 0; i < n; i++) { - A[i] = _mm_setzero_ps(); + A[i] = make_float4(0.0f); } } -ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +ccl_device_inline void math_matrix_zero_sse(float4 *A, int n) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_setzero_ps(); + MAT(A, n, row, col) = make_float4(0.0f); } } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight; } } } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_add_ps(V[i], a[i]); + V[i] += a[i]; } } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_mul_ps(V[i], a[i]); + V[i] *= a[i]; } } -ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n) { for(int i = 0; i < n; i++) { - a[i] = _mm_max_ps(a[i], b[i]); + a[i] = max(a[i], b[i]); } } -ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0]; } } } diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 66dd80420ae..a2b3247b207 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -417,39 +417,6 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) #endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */ -#undef _mm_fabs_ps -#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) - -/* Return a __m128 with every element set to the largest element of v. 
*/ -ccl_device_inline __m128 _mm_hmax_ps(__m128 v) -{ - /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ - v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); - /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ - v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); - return v; -} - -/* Return the sum of the four elements of x. */ -ccl_device_inline float _mm_hsum_ss(__m128 x) -{ - __m128 a = _mm_movehdup_ps(x); - __m128 b = _mm_add_ps(x, a); - return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); -} - -/* Return a __m128 with every element set to the sum of the four elements of x. */ -ccl_device_inline __m128 _mm_hsum_ps(__m128 x) -{ - x = _mm_hadd_ps(x, x); - x = _mm_hadd_ps(x, x); - return x; -} - -/* Replace elements of x with zero where mask isn't set. */ -#undef _mm_mask_ps -#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) - #else /* __KERNEL_SSE2__ */ /* This section is for utility functions which operates on non-register data From 0f3f093d3b3d1b2fb9593233f23665d3135bea2d Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 3 Aug 2017 16:41:50 +0200 Subject: [PATCH 09/11] Cycles: add HTML report to inspect failed test images. Shows new, reference and diff renders, with mouse hover to flip between new and ref for easy comparison. This generates a report.html in build_dir/tests/cycles, stored along with the new and diff images. Differential Revision: https://developer.blender.org/D2770 --- tests/python/CMakeLists.txt | 2 + tests/python/cycles_render_tests.py | 227 ++++++++++++++++++++++------ 2 files changed, 184 insertions(+), 45 deletions(-) diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index f2fea48d9f4..4ba89c2805d 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -518,6 +518,7 @@ if(WITH_CYCLES) -blender "$" -testdir "${TEST_SRC_DIR}/cycles/ctests/${subject}" -idiff "${OPENIMAGEIO_IDIFF}" + -outdir "${TEST_OUT_DIR}/cycles" ) else() add_test( @@ -526,6 +527,7 @@ if(WITH_CYCLES) -blender "$" -testdir "${TEST_SRC_DIR}/cycles/ctests/${subject}" -idiff "${OPENIMAGEIO_IDIFF}" + -outdir "${TEST_OUT_DIR}/cycles" ) endif() endmacro() diff --git a/tests/python/cycles_render_tests.py b/tests/python/cycles_render_tests.py index a030cc5e0de..ea84f27ab7e 100755 --- a/tests/python/cycles_render_tests.py +++ b/tests/python/cycles_render_tests.py @@ -2,7 +2,9 @@ # Apache License, Version 2.0 import argparse +import glob import os +import pathlib import shutil import subprocess import sys @@ -24,7 +26,7 @@ class COLORS_DUMMY: COLORS = COLORS_DUMMY -def printMessage(type, status, message): +def print_message(message, type=None, status=''): if type == 'SUCCESS': print(COLORS.GREEN, end="") elif type == 'FAILURE': @@ -109,20 +111,126 @@ def test_get_name(filepath): filename = os.path.basename(filepath) return os.path.splitext(filename)[0] - -def verify_output(filepath): +def test_get_images(filepath): testname = test_get_name(filepath) dirpath = os.path.dirname(filepath) - reference_dirpath = os.path.join(dirpath, "reference_renders") - reference_image = os.path.join(reference_dirpath, testname + ".png") - failed_image = os.path.join(reference_dirpath, testname + ".fail.png") - if not os.path.exists(reference_image): + ref_dirpath = os.path.join(dirpath, "reference_renders") + ref_img = os.path.join(ref_dirpath, testname + ".png") + new_dirpath = os.path.join(OUTDIR, 
os.path.basename(dirpath))
+    if not os.path.exists(new_dirpath):
+        os.makedirs(new_dirpath)
+    new_img = os.path.join(new_dirpath, testname + ".png")
+    diff_dirpath = os.path.join(OUTDIR, os.path.basename(dirpath), "diff")
+    if not os.path.exists(diff_dirpath):
+        os.makedirs(diff_dirpath)
+    diff_img = os.path.join(diff_dirpath, testname + ".diff.png")
+    return ref_img, new_img, diff_img
+
+
+class Report:
+    def __init__(self, testname):
+        self.failed_tests = ""
+        self.passed_tests = ""
+        self.testname = testname
+
+    def output(self):
+        # write intermediate data for single test
+        outdir = os.path.join(OUTDIR, self.testname)
+        f = open(os.path.join(outdir, "failed.data"), "w")
+        f.write(self.failed_tests)
+        f.close()
+
+        f = open(os.path.join(outdir, "passed.data"), "w")
+        f.write(self.passed_tests)
+        f.close()
+
+        # gather intermediate data for all tests
+        failed_data = sorted(glob.glob(os.path.join(OUTDIR, "*/failed.data")))
+        passed_data = sorted(glob.glob(os.path.join(OUTDIR, "*/passed.data")))
+
+        failed_tests = ""
+        passed_tests = ""
+
+        for filename in failed_data:
+            failed_tests += open(os.path.join(OUTDIR, filename), "r").read()
+        for filename in passed_data:
+            passed_tests += open(os.path.join(OUTDIR, filename), "r").read()
+
+        # write html for all tests
+        self.html = """
+<html>
+<head>
+<title>Cycles Test Report</title>
+</head>
+<body>
+<h1>Cycles Test Report</h1>
+<table>
+  <tr><th>Name</th><th>New</th><th>Reference</th><th>Diff</th>
+  {}{}
+</table>
+</body>
+</html>
+        """ . format(failed_tests, passed_tests)
+
+        filepath = os.path.join(OUTDIR, "report.html")
+        f = open(filepath, "w")
+        f.write(self.html)
+        f.close()
+
+        print_message("Report saved to: " + pathlib.Path(filepath).as_uri())
+
+    def add_test(self, filepath, error):
+        name = test_get_name(filepath)
+
+        ref_img, new_img, diff_img = test_get_images(filepath)
+
+        status = error if error else ""
+        style = """ style="background-color: #f99;" """ if error else ""
+
+        new_url = pathlib.Path(new_img).as_uri()
+        ref_url = pathlib.Path(ref_img).as_uri()
+        diff_url = pathlib.Path(diff_img).as_uri()
+
+        test_html = """
+            <tr{}>
+                <td>{}<br/>{}<br/>
{} + + + + """ . format(style, name, self.testname, status, + new_url, ref_url, new_url, + ref_url, new_url, ref_url, + diff_url) + + if error: + self.failed_tests += test_html + else: + self.passed_tests += test_html + + +def verify_output(report, filepath): + ref_img, new_img, diff_img = test_get_images(filepath) + if not os.path.exists(ref_img): return False + + # diff test with threshold command = ( IDIFF, - "-fail", "0.015", + "-fail", "0.016", "-failpercent", "1", - reference_image, + ref_img, TEMP_FILE, ) try: @@ -130,47 +238,66 @@ def verify_output(filepath): failed = False except subprocess.CalledProcessError as e: if VERBOSE: - print(e.output.decode("utf-8")) + print_message(e.output.decode("utf-8")) failed = e.returncode != 1 - if failed: - shutil.copy(TEMP_FILE, failed_image) - elif os.path.exists(failed_image): - os.remove(failed_image) + + # generate diff image + command = ( + IDIFF, + "-o", diff_img, + "-abs", "-scale", "16", + ref_img, + TEMP_FILE + ) + + try: + subprocess.check_output(command) + except subprocess.CalledProcessError as e: + if VERBOSE: + print_message(e.output.decode("utf-8")) + + # copy new image + if os.path.exists(new_img): + os.remove(new_img) + if os.path.exists(TEMP_FILE): + shutil.copy(TEMP_FILE, new_img) + return not failed -def run_test(filepath): +def run_test(report, filepath): testname = test_get_name(filepath) spacer = "." * (32 - len(testname)) - printMessage('SUCCESS', 'RUN', testname) + print_message(testname, 'SUCCESS', 'RUN') time_start = time.time() error = render_file(filepath) status = "FAIL" if not error: - if not verify_output(filepath): + if not verify_output(report, filepath): error = "VERIFY" time_end = time.time() elapsed_ms = int((time_end - time_start) * 1000) if not error: - printMessage('SUCCESS', 'OK', "{} ({} ms)" . - format(testname, elapsed_ms)) + print_message("{} ({} ms)" . format(testname, elapsed_ms), + 'SUCCESS', 'OK') else: if error == "NO_CYCLES": - print("Can't perform tests because Cycles failed to load!") - return False + print_message("Can't perform tests because Cycles failed to load!") + return error elif error == "NO_START": - print('Can not perform tests because blender fails to start.', + print_message('Can not perform tests because blender fails to start.', 'Make sure INSTALL target was run.') - return False + return error elif error == 'VERIFY': - print("Rendered result is different from reference image") + print_message("Rendered result is different from reference image") else: - print("Unknown error %r" % error) - printMessage('FAILURE', 'FAILED', "{} ({} ms)" . - format(testname, elapsed_ms)) + print_message("Unknown error %r" % error) + print_message("{} ({} ms)" . format(testname, elapsed_ms), + 'FAILURE', 'FAILED') return error + def blend_list(path): for dirpath, dirnames, filenames in os.walk(path): for filename in filenames: @@ -178,17 +305,18 @@ def blend_list(path): filepath = os.path.join(dirpath, filename) yield filepath - def run_all_tests(dirpath): passed_tests = [] failed_tests = [] all_files = list(blend_list(dirpath)) all_files.sort() - printMessage('SUCCESS', "==========", - "Running {} tests from 1 test case." . format(len(all_files))) + report = Report(os.path.basename(dirpath)) + print_message("Running {} tests from 1 test case." . 
+ format(len(all_files)), + 'SUCCESS', "==========") time_start = time.time() for filepath in all_files: - error = run_test(filepath) + error = run_test(report, filepath) testname = test_get_name(filepath) if error: if error == "NO_CYCLES": @@ -198,28 +326,33 @@ def run_all_tests(dirpath): failed_tests.append(testname) else: passed_tests.append(testname) + report.add_test(filepath, error) time_end = time.time() elapsed_ms = int((time_end - time_start) * 1000) - print("") - printMessage('SUCCESS', "==========", - "{} tests from 1 test case ran. ({} ms total)" . - format(len(all_files), elapsed_ms)) - printMessage('SUCCESS', 'PASSED', "{} tests." . - format(len(passed_tests))) + print_message("") + print_message("{} tests from 1 test case ran. ({} ms total)" . + format(len(all_files), elapsed_ms), + 'SUCCESS', "==========") + print_message("{} tests." . + format(len(passed_tests)), + 'SUCCESS', 'PASSED') if failed_tests: - printMessage('FAILURE', 'FAILED', "{} tests, listed below:" . - format(len(failed_tests))) + print_message("{} tests, listed below:" . + format(len(failed_tests)), + 'FAILURE', 'FAILED') failed_tests.sort() for test in failed_tests: - printMessage('FAILURE', "FAILED", "{}" . format(test)) - return False - return True + print_message("{}" . format(test), 'FAILURE', "FAILED") + + report.output() + return not bool(failed_tests) def create_argparse(): parser = argparse.ArgumentParser() parser.add_argument("-blender", nargs="+") parser.add_argument("-testdir", nargs=1) + parser.add_argument("-outdir", nargs=1) parser.add_argument("-idiff", nargs=1) return parser @@ -229,7 +362,7 @@ def main(): args = parser.parse_args() global COLORS - global BLENDER, ROOT, IDIFF + global BLENDER, TESTDIR, IDIFF, OUTDIR global TEMP_FILE, TEMP_FILE_MASK, TEST_SCRIPT global VERBOSE @@ -237,8 +370,12 @@ def main(): COLORS = COLORS_ANSI BLENDER = args.blender[0] - ROOT = args.testdir[0] + TESTDIR = args.testdir[0] IDIFF = args.idiff[0] + OUTDIR = args.outdir[0] + + if not os.path.exists(OUTDIR): + os.makedirs(OUTDIR) TEMP = tempfile.mkdtemp() TEMP_FILE_MASK = os.path.join(TEMP, "test") @@ -248,7 +385,7 @@ def main(): VERBOSE = os.environ.get("BLENDER_VERBOSE") is not None - ok = run_all_tests(ROOT) + ok = run_all_tests(TESTDIR) # Cleanup temp files and folders if os.path.exists(TEMP_FILE): From 771e6120c87d85fbaacfac2a89b21a36e9c1fc7e Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Mon, 7 Aug 2017 14:31:40 +0200 Subject: [PATCH 10/11] Cycles tests: CMake side changes to support recent files split in the SVN repo --- tests/python/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index 4ba89c2805d..ca752993c79 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -534,11 +534,16 @@ if(WITH_CYCLES) if(WITH_OPENGL_TESTS) add_cycles_render_test(opengl) endif() - add_cycles_render_test(image) + add_cycles_render_test(displacement) + add_cycles_render_test(image_data_types) + add_cycles_render_test(image_mapping) + add_cycles_render_test(image_texture_limit) add_cycles_render_test(mblur) add_cycles_render_test(reports) add_cycles_render_test(render) add_cycles_render_test(shader) + add_cycles_render_test(shadow_catcher) + add_cycles_render_test(volume) else() MESSAGE(STATUS "Disabling Cycles tests because tests folder does not exist") endif() From 580741b317ae60eb3bf999d636da0325c7e67373 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Mon, 7 Aug 2017 14:47:51 +0200 Subject: 
[PATCH 11/11] Cycles: Cleanup, space after keyword --- intern/cycles/device/opencl/opencl_util.cpp | 2 +- .../cycles/kernel/filter/filter_prefilter.h | 2 +- intern/cycles/kernel/kernel_path_state.h | 4 +-- .../cycles/kernel/split/kernel_shader_sort.h | 10 +++---- intern/cycles/render/image.cpp | 2 +- intern/cycles/render/light.cpp | 2 +- intern/cycles/util/util_math_matrix.h | 28 +++++++++---------- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 0d34af3e040..7d5173a5f1d 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -635,7 +635,7 @@ bool OpenCLInfo::device_supported(const string& platform_name, "Tahiti", "Pitcairn", "Capeverde", "Oland", NULL }; - for (int i = 0; blacklist[i] != NULL; i++) { + for(int i = 0; blacklist[i] != NULL; i++) { if(device_name == blacklist[i]) { VLOG(1) << "AMD device " << device_name << " not supported"; return false; diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index a0b89c1111f..c6a70cbeab5 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -96,7 +96,7 @@ ccl_device void kernel_filter_get_feature(int sample, int idx = (y-rect.y)*buffer_w + (x - rect.x); mean[idx] = center_buffer[m_offset] / sample; - if (sample > 1) { + if(sample > 1) { if(use_split_variance) { variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); } diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index a96ffe07718..3ce183bf67a 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -173,7 +173,7 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ } #ifdef __SHADOW_TRICKS__ /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { return 1.0f; } #endif @@ -196,7 +196,7 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ } #ifdef __SHADOW_TRICKS__ /* Exception for shadow catcher not working correctly with RR. 
*/ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { return 1.0f; } #endif diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h index 297decb0bc2..5a55b680695 100644 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local ushort *local_index = &locals->local_index[0]; /* copy to local memory */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint add = input + idx; uint value = (~0); @@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # ifdef __KERNEL_OPENCL__ /* bitonic sort */ - for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { - for (uint inc = length; inc > 0; inc >>= 1) { - for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for(uint inc = length; inc > 0; inc >>= 1) { + for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { uint i = lid + ii; bool direction = ((i & (length << 1)) != 0); uint j = i ^ inc; @@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # endif /* __KERNEL_OPENCL__ */ /* copy to destination */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint lidx = local_index[i + lid]; uint outi = output + idx; diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 02b65440154..a490f10aee4 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -344,7 +344,7 @@ int ImageManager::add_image(const string& filename, else { /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */ int tex_count = 0; - for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { tex_count += tex_num_images[type]; } if(tex_count > max_num_images) { diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 93d88c5642c..371ea54ef11 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -225,7 +225,7 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene) bool LightManager::object_usable_as_light(Object *object) { Mesh *mesh = object->mesh; /* Skip objects with NaNs */ - if (!object->bounds.valid()) { + if(!object->bounds.valid()) { return false; } /* Skip if we are not visible for BSDFs. */ diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index 7269d391956..b31dbe4fc67 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float { const float singular_epsilon = 1e-9f; - for (int row = 0; row < n; row++) { - for (int col = 0; col < n; col++) { + for(int row = 0; row < n; row++) { + for(int col = 0; col < n; col++) { MATS(V, n, row, col, v_stride) = (col == row) ? 
1.0f : 0.0f; } } - for (int sweep = 0; sweep < 8; sweep++) { + for(int sweep = 0; sweep < 8; sweep++) { float off_diagonal = 0.0f; - for (int row = 1; row < n; row++) { - for (int col = 0; col < row; col++) { + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { off_diagonal += fabsf(MAT(A, n, row, col)); } } - if (off_diagonal < 1e-7f) { + if(off_diagonal < 1e-7f) { /* The matrix has nearly reached diagonal form. * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ break; @@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float float abs_element = fabsf(element); /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ - if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { MAT(A, n, row, col) = 0.0f; continue; } @@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float * Then, we compute sin(phi) and cos(phi) themselves. */ float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); float ratio; - if (abs_element > singular_epsilon*fabsf(singular_diff)) { + if(abs_element > singular_epsilon*fabsf(singular_diff)) { float cot_2phi = 0.5f*singular_diff / element; ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); - if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ } else { ratio = element / singular_diff; @@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } /* Sort eigenvalues and the associated eigenvectors. */ - for (int i = 0; i < n - 1; i++) { + for(int i = 0; i < n - 1; i++) { float v = MAT(A, n, i, i); int k = i; - for (int j = i; j < n; j++) { - if (MAT(A, n, j, j) >= v) { + for(int j = i; j < n; j++) { + if(MAT(A, n, j, j) >= v) { v = MAT(A, n, j, j); k = j; } } - if (k != i) { + if(k != i) { /* Swap eigenvalues. */ MAT(A, n, k, k) = MAT(A, n, i, i); MAT(A, n, i, i) = v; /* Swap eigenvectors. */ - for (int j = 0; j < n; j++) { + for(int j = 0; j < n; j++) { float v = MATS(V, n, i, j, v_stride); MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); MATS(V, n, k, j, v_stride) = v;