Cycles: optimization for BVH traversal on CPUs with SSE3, using code from Embree.

On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and a 30% speedup with Visual Studio (2008). It turns out Visual Studio was optimizing the existing code quite poorly compared to the pretty good autovectorization by clang/gcc, but hand-written SSE code also gives a smaller speed boost there. This code isn't enabled when using the hair minimum width feature yet; that still needs to be made to work with the SSE code.
2013-06-18 09:36:06 +00:00
parent 9131adca9f
commit d57c6748c4
5 changed files with 278 additions and 127 deletions
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -552,9 +552,9 @@ void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int
 {
 	int4 data[BVH_NODE_SIZE] =
 	{
-		make_int4(__float_as_int(b0.min.x), __float_as_int(b0.max.x), __float_as_int(b0.min.y), __float_as_int(b0.max.y)),
-		make_int4(__float_as_int(b1.min.x), __float_as_int(b1.max.x), __float_as_int(b1.min.y), __float_as_int(b1.max.y)),
-		make_int4(__float_as_int(b0.min.z), __float_as_int(b0.max.z), __float_as_int(b1.min.z), __float_as_int(b1.max.z)),
+		make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)),
+		make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)),
+		make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)),
 		make_int4(c0, c1, visibility0, visibility1)
 	};

--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -112,80 +112,8 @@ __device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, cons
 }
 #endif

-/* intersect two bounding boxes */
-#ifdef __HAIR__
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
-	bool *traverseChild0, bool *traverseChild1,
-	bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
-	float3 P, float3 idir, float t, uint visibility, int nodeAddr, float difl, float extmax)
-{
-#else
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
-	bool *traverseChild0, bool *traverseChild1,
-	bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
-	float3 P, float3 idir, float t, uint visibility, int nodeAddr)
-{
-#endif
-
-	/* fetch node data */
-	float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
-	float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
-	float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
-
-	/* intersect ray against child nodes */
-	float3 ood = P * idir;
-	NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-	NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-#ifdef __HAIR__
-	if(difl != 0.0f) {
-		float hdiff = 1.0f + difl;
-		float ldiff = 1.0f - difl;
-		if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-			c0min = max(ldiff * c0min, c0min - extmax);
-			c0max = min(hdiff * c0max, c0max + extmax);
-		}
-		if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-			c1min = max(ldiff * c1min, c1min - extmax);
-			c1max = min(hdiff * c1max, c1max + extmax);
-		}
-	}
-#endif
-
-	/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	*traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-	*traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
-	*traverseChild0 = (c0max >= c0min);
-	*traverseChild1 = (c1max >= c1min);
-#endif
-
-	*nodeAddr0 = __float_as_int(cnodes.x);
-	*nodeAddr1 = __float_as_int(cnodes.y);
-
-	*closestChild1 = (c1min < c0min);
-}
-
 /* Sven Woop's algorithm */
-__device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int triAddr)
 {
 	/* compute and check intersection t-value */
@@ -223,10 +151,13 @@ __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *ise
 					isect->u = u;
 					isect->v = v;
 					isect->t = t;
+					return true;
 				}
 			}
 		}
 	}
+
+	return false;
 }

 #ifdef __HAIR__
@@ -280,7 +211,7 @@ __device_inline void curvebounds(float *lower, float *upper, float *extremta, fl
 	}
 }

-__device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
 {
 	float epsilon = 0.0f;
@@ -346,7 +277,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
 	float zextrem[4];
 	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
 	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return;
+		return false;

 	/*minimum width extension*/
 	float mw_extension = min(difl * fabsf(upper), extmax);
@@ -355,17 +286,18 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
 	float xextrem[4];
 	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
 	if(lower > r_ext || upper < -r_ext)
-		return;
+		return false;

 	float yextrem[4];
 	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
 	if(lower > r_ext || upper < -r_ext)
-		return;
+		return false;

 	/*setup recurrent loop*/
 	int level = 1 << depth;
 	int tree = 0;
 	float resol = 1.0f / (float)level;
+	bool hit = false;

 	/*begin loop*/
 	while(!(tree >> (depth))) {
@@ -557,7 +489,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
 			/*stochastic fade from minimum width*/
 			if(lcg_state && coverage != 1.0f) {
 				if(lcg_step(lcg_state) > coverage)
-					return;
+					return hit;
 			}

 #ifdef __VISIBILITY_FLAG__
@@ -574,6 +506,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
 				isect->v = 0.0f;
 				/*isect->v = 1.0f - coverage; */
 				isect->t = t;
+				hit = true;
 			}
 			
 			tree++;
@@ -584,9 +517,11 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
 			level = level >> 1;
 		}
 	}
+
+	return hit;
 }

-__device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
 {
 	/* curve Intersection check */
@@ -630,7 +565,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	sphere_b = dot(dir,sphere_dif);
 	float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
 	if(sdisc < 0.0f)
-		return;
+		return false;

 	/* obtain parameters and test midpoint distance for suitable modes*/
 	float3 tg = (p2 - p1) / l;
@@ -645,9 +580,9 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float zcentre = difz + (dirz * tcentre);

 	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return;
+		return false;
 	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return;
+		return false;

 	/* test minimum separation*/
 	float3 cprod = cross(tg, dir);
@@ -662,7 +597,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 		distscaled = (distscaled*distscaled)/cprodsq;

 	if(distscaled > mr*mr)
-		return;
+		return false;

 	/* calculate true intersection*/
 	float3 tdif = P - p1 + tcentre * dir;
@@ -672,7 +607,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float td = tb*tb - 4*a*tc;

 	if (td < 0.0f)
-		return;
+		return false;

 	float rootd = 0.0f;
 	float correction = 0.0f;
@@ -706,7 +641,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 		adjradius = adjradius / (r1 + z * gd);
 		if(lcg_state && adjradius != 1.0f) {
 			if(lcg_step(lcg_state) > adjradius)
-				return;
+				return false;
 		}
 		/* --- */

@@ -719,7 +654,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
 					float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
 					if(a2*c2 < 0.0f)
-						return;
+						return false;
 				}
 			}

@@ -740,9 +675,13 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,

 				if(backface) 
 					isect->u = -isect->u;
+				
+				return true;
 			}
 		}
 	}
+
+	return false;
 }
 #endif

@@ -751,7 +690,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 * only want to intersect with primitives in the same object, and if case of
 * multiple hits we pick a single random primitive as the intersection point. */

-__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, int object, int triAddr, float tmax, int *num_hits, float subsurface_random)
 {
 	/* compute and check intersection t-value */
@@ -786,10 +725,13 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
 					isect->u = u;
 					isect->v = v;
 					isect->t = t;
+					return true;
 				}
 			}
 		}
 	}
+
+	return false;
 }
 #endif

--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -1,6 +1,8 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -41,6 +43,14 @@ __device bool BVH_FUNCTION_NAME
 #endif
 )
 {
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - SSE for hair
+	 * - test restrict attribute for pointers
+	 */
+	
 	/* traversal stack in CUDA thread-local memory */
 	int traversalStack[BVH_STACK_SIZE];
 	traversalStack[0] = ENTRYPOINT_SENTINEL;
@@ -70,6 +80,28 @@ __device bool BVH_FUNCTION_NAME
 	isect->u = 0.0f;
 	isect->v = 0.0f;

+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+	const __m128i shuffle_identity = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+	const __m128i shuffle_swap = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+
+	const __m128i pn = _mm_set_epi32(0x80000000, 0x80000000, 0x00000000, 0x00000000);
+	__m128 Psplat[3], idirsplat[3];
+
+	Psplat[0] = _mm_set_ps1(P.x);
+	Psplat[1] = _mm_set_ps1(P.y);
+	Psplat[2] = _mm_set_ps1(P.z);
+
+	idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
+	idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
+	idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+
+	__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+	__m128i shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
+	__m128i shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
+	__m128i shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
+#endif
+
 	/* traversal loop */
 	do {
 		do
@@ -77,46 +109,121 @@ __device bool BVH_FUNCTION_NAME
 			/* traverse internal nodes */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
 			{
-				bool traverseChild0, traverseChild1, closestChild1;
+				bool traverseChild0, traverseChild1;
 				int nodeAddrChild1;
+				float t = isect->t;
+
+#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				/* Intersect two child bounding boxes, non-SSE version */
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);

 #if FEATURE(BVH_HAIR_MINIMUM_WIDTH) && !FEATURE(BVH_SUBSURFACE)
-				bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
-					&closestChild1, &nodeAddr, &nodeAddrChild1,
-					P, idir, isect->t, visibility, nodeAddr, difl, extmax);
-#else
-				bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
-					&closestChild1, &nodeAddr, &nodeAddrChild1,
-#ifdef __HAIR__
-					P, idir, isect->t, visibility, nodeAddr, 0.0f, 0.0f);
-#else
-					P, idir, isect->t, visibility, nodeAddr);
-#endif
+				if(difl != 0.0f) {
+					float hdiff = 1.0f + difl;
+					float ldiff = 1.0f - difl;
+					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+						c0min = max(ldiff * c0min, c0min - extmax);
+						c0max = min(hdiff * c0max, c0max + extmax);
+					}
+					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+						c1min = max(ldiff * c1min, c1min - extmax);
+						c1max = min(hdiff * c1max, c1max + extmax);
+					}
+				}
 #endif

-				if(traverseChild0 != traverseChild1) {
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE3__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				__m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
+
+				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), _mm_castsi128_ps(pn));
+				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle8(tminmax, shuffle_swap));
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE3__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+					bool closestChild1 = (c1min < c0min);
+#else
+					union { __m128 m128; float v[4]; } uminmax;
+					uminmax.m128 = tminmax;
+					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
 					/* one child was intersected */
 					if(traverseChild1) {
 						nodeAddr = nodeAddrChild1;
 					}
-				}
-				else {
-					if(!traverseChild0) {
+					else if(!traverseChild0) {
 						/* neither child was intersected */
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
-					else {
-						/* both children were intersected, push the farther one */
-						if(closestChild1) {
-							int tmp = nodeAddr;
-							nodeAddr = nodeAddrChild1;
-							nodeAddrChild1 = tmp;
-						}
-
-						++stackPtr;
-						traversalStack[stackPtr] = nodeAddrChild1;
-					}
 				}
 			}

@@ -136,6 +243,7 @@ __device bool BVH_FUNCTION_NAME

 					/* primitive intersection */
 					while(primAddr < primAddr2) {
+						bool hit;
 #if FEATURE(BVH_SUBSURFACE)
 						/* only primitives from the same object */
 						uint tri_object = (object == ~0)? kernel_tex_fetch(__prim_object, primAddr): object;
@@ -148,15 +256,16 @@ __device bool BVH_FUNCTION_NAME
 							uint segment = kernel_tex_fetch(__prim_segment, primAddr);
 #if !FEATURE(BVH_SUBSURFACE)
 							if(segment != ~0) {
+
 								if(kernel_data.curve_kernel_data.curveflags & CURVE_KN_INTERPOLATE) 
 #if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-									bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
 								else
-									bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
+									hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
 #else
-									bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
 								else
-									bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+									hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
 #endif
 							}
 							else
@@ -166,15 +275,25 @@ __device bool BVH_FUNCTION_NAME
 #if FEATURE(BVH_HAIR)
 							if(segment == ~0)
 #endif
-								bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random);
+								hit = bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random);

 						}
 #else
-								bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
+								hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);

 							/* shadow ray early termination */
-							if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0)
+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+							if(hit) {
+								if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									return true;
+
+								tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+							}
+#else
+							if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
 								return true;
+#endif
+
 #endif

 						primAddr++;
@@ -196,6 +315,22 @@ __device bool BVH_FUNCTION_NAME
 						bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
 #endif

+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+						Psplat[0] = _mm_set_ps1(P.x);
+						Psplat[1] = _mm_set_ps1(P.y);
+						Psplat[2] = _mm_set_ps1(P.z);
+
+						idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
+						idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
+						idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+
+						tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+						shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
+						shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
+						shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
+#endif
+
 						++stackPtr;
 						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;

@@ -223,6 +358,23 @@ __device bool BVH_FUNCTION_NAME
 #else
 			bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
 #endif
+
+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+			Psplat[0] = _mm_set_ps1(P.x);
+			Psplat[1] = _mm_set_ps1(P.y);
+			Psplat[2] = _mm_set_ps1(P.z);
+
+			idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
+			idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
+			idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+
+			tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+			shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
+			shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
+			shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
+#endif
+
 			object = ~0;
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -22,6 +22,8 @@

 #ifdef WITH_OPTIMIZED_KERNEL

+#define __KERNEL_SSE3__
+
 #include "kernel.h"
 #include "kernel_compat_cpu.h"
 #include "kernel_math.h"
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -72,13 +72,21 @@
 #include <tmmintrin.h> /* SSE 3 */
 #include <smmintrin.h> /* SSE 4 */

+#ifndef __KERNEL_SSE2__
 #define __KERNEL_SSE2__
+#endif
+
+#ifndef __KERNEL_SSE3__
 #define __KERNEL_SSE3__
+#endif
+
+#ifndef __KERNEL_SSE4__
 #define __KERNEL_SSE4__
+#endif

 #else

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__KERNEL_SSE3__)

 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, better only include that */
@@ -87,9 +95,16 @@
 #else
 #include <xmmintrin.h> /* SSE 1 */
 #include <emmintrin.h> /* SSE 2 */
+
+#ifdef __KERNEL_SSE3__
+#include <pmmintrin.h> /* SSE 3 */
+#include <tmmintrin.h> /* SSE 3 */
+#endif
 #endif

+#ifndef __KERNEL_SSE2__
 #define __KERNEL_SSE2__
+#endif

 #endif

@@ -471,6 +486,46 @@ __device_inline int4 make_int4(const float3& f)

 #endif

+#ifdef __KERNEL_SSE3__
+
+/* SSE shuffle utility functions */
+
+__device_inline const __m128 shuffle8(const __m128& a, const __m128i& shuf)
+{
+	return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& a, const __m128& b)
+{
+	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& b)
+{
+	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+#endif
+
+#if defined(__KERNEL_SSE2__) && defined(_MSC_VER)
+
+/* count zeros from start or end of integer bits */
+
+__device_inline uint32_t __builtin_ctz(uint32_t i)
+{
+	unsigned long r = 0;
+	_BitScanForward(&r, i);
+	return (uint32_t)r;
+}
+
+__device_inline uint32_t __builtin_clz(uint32_t i)
+{
+	unsigned long r = 0;
+	_BitScanReverse(&r, i);
+	return (uint32_t)(31 - r); /* _BitScanReverse yields the index of the highest set bit; the leading-zero count is 31 minus that */
+}
+
+#endif
+
 CCL_NAMESPACE_END

 #endif /* __UTIL_TYPES_H__ */