Cycles: optimization for BVH traversal on CPUs with SSE3, using code from Embree.

On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and 30%
speedup with visual studio (2008). It turns out visual studio was optimizing the
existing code quite poorly compared to pretty good autovectorization by clang/gcc,
but hand written SSE code also gives a smaller speed boost there.

This code isn't enabled yet when the hair minimum width feature is in use; that
feature still needs to be made to work with the SSE code.
This commit is contained in:
Brecht Van Lommel
2013-06-18 09:36:06 +00:00
parent 9131adca9f
commit d57c6748c4
5 changed files with 278 additions and 127 deletions

View File

@@ -552,9 +552,9 @@ void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int
{
int4 data[BVH_NODE_SIZE] =
{
make_int4(__float_as_int(b0.min.x), __float_as_int(b0.max.x), __float_as_int(b0.min.y), __float_as_int(b0.max.y)),
make_int4(__float_as_int(b1.min.x), __float_as_int(b1.max.x), __float_as_int(b1.min.y), __float_as_int(b1.max.y)),
make_int4(__float_as_int(b0.min.z), __float_as_int(b0.max.z), __float_as_int(b1.min.z), __float_as_int(b1.max.z)),
make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)),
make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)),
make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)),
make_int4(c0, c1, visibility0, visibility1)
};

View File

@@ -112,80 +112,8 @@ __device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, cons
}
#endif
/* Intersect a ray with the two child bounding boxes of one BVH node.
 *
 * Fetches four float4 entries of __bvh_nodes starting at nodeAddr*BVH_NODE_SIZE and
 * slab-tests both children against the ray interval [0, t].
 *
 * Outputs:
 * - traverseChild0 / traverseChild1: true when that child's clipped interval is
 *   non-empty (max >= min); with __VISIBILITY_FLAG__ the bits stored in
 *   cnodes.z / cnodes.w must additionally share a bit with visibility.
 * - nodeAddr0 / nodeAddr1: cnodes.x / cnodes.y reinterpreted as ints.
 * - closestChild1: true when child 1's entry distance is smaller than child 0's.
 *
 * With __HAIR__ and a non-zero difl, the interval of a child whose cnodes.z / cnodes.w
 * bits contain PATH_RAY_CURVE is loosened; extmax bounds how far it may move.
 *
 * NOTE(review): idir is presumably the component-wise reciprocal of the ray
 * direction -- confirm against the caller. */
#ifdef __HAIR__
__device_inline void bvh_node_intersect(KernelGlobals *kg,
bool *traverseChild0, bool *traverseChild1,
bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
float3 P, float3 idir, float t, uint visibility, int nodeAddr, float difl, float extmax)
{
#else
__device_inline void bvh_node_intersect(KernelGlobals *kg,
bool *traverseChild0, bool *traverseChild1,
bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
float3 P, float3 idir, float t, uint visibility, int nodeAddr)
{
#endif
/* fetch node data */
/* As used below: n0xy carries child 0's x planes in .x/.y and y planes in .z/.w,
 * n1xy the same for child 1, nz carries child 0's z planes in .x/.y and child 1's
 * in .z/.w, and cnodes carries (child 0 address, child 1 address, child 0 flags,
 * child 1 flags) as raw bit patterns. */
float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
/* intersect ray against child nodes */
/* ood is the ray origin pre-multiplied by idir, so every plane distance below is
 * plane * idir - ood. */
float3 ood = P * idir;
/* child 0: distances to the two planes on each axis */
NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
/* entry distance is the largest per-axis near distance, clamped to >= 0;
 * exit distance is the smallest per-axis far distance, clamped to <= t */
NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
/* child 1: same computation */
NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
#ifdef __HAIR__
/* Loosen the interval of curve children: the entry distance is scaled by (1 - difl)
 * and the exit distance by (1 + difl), but neither moves by more than extmax. */
if(difl != 0.0f) {
float hdiff = 1.0f + difl;
float ldiff = 1.0f - difl;
if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
c0min = max(ldiff * c0min, c0min - extmax);
c0max = min(hdiff * c0max, c0max + extmax);
}
if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
c1min = max(ldiff * c1min, c1min - extmax);
c1max = min(hdiff * c1max, c1max + extmax);
}
}
#endif
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
*traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
*traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
#else
*traverseChild0 = (c0max >= c0min);
*traverseChild1 = (c1max >= c1min);
#endif
/* child addresses are stored as int bit patterns inside the float4 */
*nodeAddr0 = __float_as_int(cnodes.x);
*nodeAddr1 = __float_as_int(cnodes.y);
/* compared after any hair adjustment above */
*closestChild1 = (c1min < c0min);
}
/* Sven Woop's algorithm */
__device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int triAddr)
{
/* compute and check intersection t-value */
@@ -223,10 +151,13 @@ __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *ise
isect->u = u;
isect->v = v;
isect->t = t;
return true;
}
}
}
}
return false;
}
#ifdef __HAIR__
@@ -280,7 +211,7 @@ __device_inline void curvebounds(float *lower, float *upper, float *extremta, fl
}
}
__device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
float epsilon = 0.0f;
@@ -346,7 +277,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
float zextrem[4];
curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
if(lower - r_curr > isect->t || upper + r_curr < epsilon)
return;
return false;
/*minimum width extension*/
float mw_extension = min(difl * fabsf(upper), extmax);
@@ -355,17 +286,18 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
float xextrem[4];
curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
if(lower > r_ext || upper < -r_ext)
return;
return false;
float yextrem[4];
curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
if(lower > r_ext || upper < -r_ext)
return;
return false;
/*setup recurrent loop*/
int level = 1 << depth;
int tree = 0;
float resol = 1.0f / (float)level;
bool hit = false;
/*begin loop*/
while(!(tree >> (depth))) {
@@ -557,7 +489,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
/*stochastic fade from minimum width*/
if(lcg_state && coverage != 1.0f) {
if(lcg_step(lcg_state) > coverage)
return;
return hit;
}
#ifdef __VISIBILITY_FLAG__
@@ -574,6 +506,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
isect->v = 0.0f;
/*isect->v = 1.0f - coverage; */
isect->t = t;
hit = true;
}
tree++;
@@ -584,9 +517,11 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
level = level >> 1;
}
}
return hit;
}
__device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
/* curve Intersection check */
@@ -630,7 +565,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
sphere_b = dot(dir,sphere_dif);
float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
if(sdisc < 0.0f)
return;
return false;
/* obtain parameters and test midpoint distance for suitable modes*/
float3 tg = (p2 - p1) / l;
@@ -645,9 +580,9 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float zcentre = difz + (dirz * tcentre);
if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
return;
return false;
if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
return;
return false;
/* test minimum separation*/
float3 cprod = cross(tg, dir);
@@ -662,7 +597,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
distscaled = (distscaled*distscaled)/cprodsq;
if(distscaled > mr*mr)
return;
return false;
/* calculate true intersection*/
float3 tdif = P - p1 + tcentre * dir;
@@ -672,7 +607,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float td = tb*tb - 4*a*tc;
if (td < 0.0f)
return;
return false;
float rootd = 0.0f;
float correction = 0.0f;
@@ -706,7 +641,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
adjradius = adjradius / (r1 + z * gd);
if(lcg_state && adjradius != 1.0f) {
if(lcg_step(lcg_state) > adjradius)
return;
return false;
}
/* --- */
@@ -719,7 +654,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
if(a2*c2 < 0.0f)
return;
return false;
}
}
@@ -740,9 +675,13 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
if(backface)
isect->u = -isect->u;
return true;
}
}
}
return false;
}
#endif
@@ -751,7 +690,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
* only want to intersect with primitives in the same object, and if case of
* multiple hits we pick a single random primitive as the intersection point. */
__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
__device_inline bool bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, int object, int triAddr, float tmax, int *num_hits, float subsurface_random)
{
/* compute and check intersection t-value */
@@ -786,10 +725,13 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
isect->u = u;
isect->v = v;
isect->t = t;
return true;
}
}
}
}
return false;
}
#endif

View File

@@ -1,6 +1,8 @@
/*
* Adapted from code Copyright 2009-2010 NVIDIA Corporation
* Modifications Copyright 2011, Blender Foundation.
* Adapted from code Copyright 2009-2010 NVIDIA Corporation,
* and code copyright 2009-2012 Intel Corporation
*
* Modifications Copyright 2011-2013, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,6 +43,14 @@ __device bool BVH_FUNCTION_NAME
#endif
)
{
/* todo:
* - test if pushing distance on the stack helps (for non shadow rays)
* - separate version for shadow rays
* - likely and unlikely for if() statements
* - SSE for hair
* - test restrict attribute for pointers
*/
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
@@ -70,6 +80,28 @@ __device bool BVH_FUNCTION_NAME
isect->u = 0.0f;
isect->v = 0.0f;
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
const __m128i shuffle_identity = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
const __m128i shuffle_swap = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
const __m128i pn = _mm_set_epi32(0x80000000, 0x80000000, 0x00000000, 0x00000000);
__m128 Psplat[3], idirsplat[3];
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
__m128i shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
__m128i shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
__m128i shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
#endif
/* traversal loop */
do {
do
@@ -77,46 +109,121 @@ __device bool BVH_FUNCTION_NAME
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
{
bool traverseChild0, traverseChild1, closestChild1;
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
float t = isect->t;
#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
/* Intersect two child bounding boxes, non-SSE version */
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
/* intersect ray against child nodes */
NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) && !FEATURE(BVH_SUBSURFACE)
bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
&closestChild1, &nodeAddr, &nodeAddrChild1,
P, idir, isect->t, visibility, nodeAddr, difl, extmax);
#else
bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
&closestChild1, &nodeAddr, &nodeAddrChild1,
#ifdef __HAIR__
P, idir, isect->t, visibility, nodeAddr, 0.0f, 0.0f);
#else
P, idir, isect->t, visibility, nodeAddr);
#endif
if(difl != 0.0f) {
float hdiff = 1.0f + difl;
float ldiff = 1.0f - difl;
if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
c0min = max(ldiff * c0min, c0min - extmax);
c0max = min(hdiff * c0max, c0max + extmax);
}
if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
c1min = max(ldiff * c1min, c1min - extmax);
c1max = min(hdiff * c1max, c1max + extmax);
}
}
#endif
if(traverseChild0 != traverseChild1) {
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
#else
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
#endif
#else // __KERNEL_SSE3__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
__m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), _mm_castsi128_ps(pn));
const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle8(tminmax, shuffle_swap));
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
#else
traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
#endif
#endif // __KERNEL_SSE3__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
bool closestChild1 = (c1min < c0min);
#else
union { __m128 m128; float v[4]; } uminmax;
uminmax.m128 = tminmax;
bool closestChild1 = uminmax.v[1] < uminmax.v[0];
#endif
if(closestChild1) {
int tmp = nodeAddr;
nodeAddr = nodeAddrChild1;
nodeAddrChild1 = tmp;
}
++stackPtr;
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
nodeAddr = nodeAddrChild1;
}
}
else {
if(!traverseChild0) {
else if(!traverseChild0) {
/* neither child was intersected */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
else {
/* both children were intersected, push the farther one */
if(closestChild1) {
int tmp = nodeAddr;
nodeAddr = nodeAddrChild1;
nodeAddrChild1 = tmp;
}
++stackPtr;
traversalStack[stackPtr] = nodeAddrChild1;
}
}
}
@@ -136,6 +243,7 @@ __device bool BVH_FUNCTION_NAME
/* primitive intersection */
while(primAddr < primAddr2) {
bool hit;
#if FEATURE(BVH_SUBSURFACE)
/* only primitives from the same object */
uint tri_object = (object == ~0)? kernel_tex_fetch(__prim_object, primAddr): object;
@@ -148,15 +256,16 @@ __device bool BVH_FUNCTION_NAME
uint segment = kernel_tex_fetch(__prim_segment, primAddr);
#if !FEATURE(BVH_SUBSURFACE)
if(segment != ~0) {
if(kernel_data.curve_kernel_data.curveflags & CURVE_KN_INTERPOLATE)
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
else
bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
#else
bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
else
bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
#endif
}
else
@@ -166,15 +275,25 @@ __device bool BVH_FUNCTION_NAME
#if FEATURE(BVH_HAIR)
if(segment == ~0)
#endif
bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random);
hit = bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random);
}
#else
bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
/* shadow ray early termination */
if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0)
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(hit) {
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
}
#else
if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
#endif
#endif
primAddr++;
@@ -196,6 +315,22 @@ __device bool BVH_FUNCTION_NAME
bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
#endif
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
#endif
++stackPtr;
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
@@ -223,6 +358,23 @@ __device bool BVH_FUNCTION_NAME
#else
bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
#endif
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
#endif
object = ~0;
nodeAddr = traversalStack[stackPtr];
--stackPtr;

View File

@@ -22,6 +22,8 @@
#ifdef WITH_OPTIMIZED_KERNEL
#define __KERNEL_SSE3__
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_math.h"

View File

@@ -72,13 +72,21 @@
#include <tmmintrin.h> /* SSE 3 */
#include <smmintrin.h> /* SSE 4 */
#ifndef __KERNEL_SSE2__
#define __KERNEL_SSE2__
#endif
#ifndef __KERNEL_SSE3__
#define __KERNEL_SSE3__
#endif
#ifndef __KERNEL_SSE4__
#define __KERNEL_SSE4__
#endif
#else
#ifdef __x86_64__
#if defined(__x86_64__) || defined(__KERNEL_SSE3__)
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
* Since we can't avoid including <windows.h>, better only include that */
@@ -87,9 +95,16 @@
#else
#include <xmmintrin.h> /* SSE 1 */
#include <emmintrin.h> /* SSE 2 */
#ifdef __KERNEL_SSE3__
#include <pmmintrin.h> /* SSE 3 */
#include <tmmintrin.h> /* SSE 3 */
#endif
#endif
#ifndef __KERNEL_SSE2__
#define __KERNEL_SSE2__
#endif
#endif
@@ -471,6 +486,46 @@ __device_inline int4 make_int4(const float3& f)
#endif
#ifdef __KERNEL_SSE3__
/* SSE shuffle utility functions */
/* Byte-wise permutation of a 128-bit float vector.
 *
 * The bits of a are reinterpreted as sixteen bytes, rearranged with
 * _mm_shuffle_epi8 using shuf as the per-byte selector, and reinterpreted
 * back as four floats. No numeric conversion takes place. */
__device_inline const __m128 shuffle8(const __m128& a, const __m128i& shuf)
{
	const __m128i source_bytes = _mm_castps_si128(a);
	const __m128i permuted_bytes = _mm_shuffle_epi8(source_bytes, shuf);

	return _mm_castsi128_ps(permuted_bytes);
}
/* Compile-time shuffle of two float vectors via _mm_shuffle_ps.
 *
 * Template arguments are given lowest lane first: result lanes 0 and 1 are
 * a[i0] and a[i1], result lanes 2 and 3 are b[i2] and b[i3]. */
template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& a, const __m128& b)
{
	/* _MM_SHUFFLE takes its arguments highest lane first, hence the reversal */
	enum { lane_selector = _MM_SHUFFLE(i3, i2, i1, i0) };

	return _mm_shuffle_ps(a, b, lane_selector);
}
/* Compile-time permutation of the four lanes of a single float vector.
 *
 * Template arguments are given lowest lane first: result lane k is b[i_k].
 * Implemented with the integer shuffle _mm_shuffle_epi32 on the reinterpreted
 * bits, so lane values are moved without any numeric conversion. */
template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& b)
{
	/* _MM_SHUFFLE takes its arguments highest lane first, hence the reversal */
	enum { lane_selector = _MM_SHUFFLE(i3, i2, i1, i0) };

	const __m128i source_lanes = _mm_castps_si128(b);
	const __m128i permuted_lanes = _mm_shuffle_epi32(source_lanes, lane_selector);

	return _mm_castsi128_ps(permuted_lanes);
}
#endif
#if defined(__KERNEL_SSE2__) && defined(_MSC_VER)
/* count zeros from start or end of integer bits */
/* MSVC stand-in for GCC's __builtin_ctz: number of zero bits below the
 * lowest set bit of i.
 *
 * _BitScanForward writes the index of the lowest set bit, which equals the
 * trailing-zero count. The result for i == 0 is not meaningful. */
__device_inline uint32_t __builtin_ctz(uint32_t i)
{
	unsigned long lowest_set_bit = 0;

	_BitScanForward(&lowest_set_bit, i);

	return static_cast<uint32_t>(lowest_set_bit);
}
/* MSVC stand-in for GCC's __builtin_clz: number of zero bits above the
 * highest set bit of the 32-bit value i.
 *
 * _BitScanReverse writes the index (0..31, counted from the least significant
 * bit) of the highest set bit, so the leading-zero count is 31 minus that
 * index. Returning the index unchanged, as before, gives e.g. 0 for i == 1
 * where the GCC builtin gives 31.
 *
 * As with the GCC builtin, the result for i == 0 is not meaningful. */
__device_inline uint32_t __builtin_clz(uint32_t i)
{
	unsigned long r = 0;
	_BitScanReverse(&r, i);
	/* convert "index of highest set bit" into "count of leading zeros" */
	return (uint32_t)(31 - r);
}
#endif
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_H__ */