Cycles: avoid 1.0f/(1.0f/x) divisions, which msvc (only) can't optimize.

This makes bmw scene in msvc 12 builds 6% faster.
It also gives a minor speedup for SSE hair in all compilers.
This commit is contained in:
Sv. Lockal
2014-04-03 22:08:53 +04:00
parent 5e5ec4c138
commit e7c2578576
6 changed files with 55 additions and 62 deletions

View File

@@ -50,7 +50,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
/* ray parameters in registers */ /* ray parameters in registers */
const float tmax = ray->t; const float tmax = ray->t;
float3 P = ray->P; float3 P = ray->P;
float3 idir = bvh_inverse_direction(ray->D); float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE; int object = OBJECT_NONE;
float isect_t = tmax; float isect_t = tmax;
@@ -215,11 +216,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
switch(type & PRIMITIVE_ALL) { switch(type & PRIMITIVE_ALL) {
case PRIMITIVE_TRIANGLE: { case PRIMITIVE_TRIANGLE: {
triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); triangle_intersect_subsurface(kg, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
break; break;
} }
case PRIMITIVE_MOTION_TRIANGLE: { case PRIMITIVE_MOTION_TRIANGLE: {
motion_triangle_intersect_subsurface(kg, isect_array, P, idir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
break; break;
} }
default: { default: {
@@ -235,9 +236,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
object = subsurface_object; object = subsurface_object;
#if FEATURE(BVH_MOTION) #if FEATURE(BVH_MOTION)
bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax);
#else #else
bvh_instance_push(kg, object, ray, &P, &idir, &isect_t, tmax); bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t, tmax);
#endif #endif
#if defined(__KERNEL_SSE2__) #if defined(__KERNEL_SSE2__)
@@ -271,9 +272,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
/* instance pop */ /* instance pop */
#if FEATURE(BVH_MOTION) #if FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax);
#else #else
bvh_instance_pop(kg, object, ray, &P, &idir, &isect_t, tmax); bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t, tmax);
#endif #endif
#if defined(__KERNEL_SSE2__) #if defined(__KERNEL_SSE2__)

View File

@@ -55,7 +55,8 @@ ccl_device bool BVH_FUNCTION_NAME
/* ray parameters in registers */ /* ray parameters in registers */
const float tmax = ray->t; const float tmax = ray->t;
float3 P = ray->P; float3 P = ray->P;
float3 idir = bvh_inverse_direction(ray->D); float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE; int object = OBJECT_NONE;
#if FEATURE(BVH_MOTION) #if FEATURE(BVH_MOTION)
@@ -253,11 +254,11 @@ ccl_device bool BVH_FUNCTION_NAME
switch(type & PRIMITIVE_ALL) { switch(type & PRIMITIVE_ALL) {
case PRIMITIVE_TRIANGLE: { case PRIMITIVE_TRIANGLE: {
hit = triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); hit = triangle_intersect(kg, isect, P, dir, visibility, object, primAddr);
break; break;
} }
case PRIMITIVE_MOTION_TRIANGLE: { case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(kg, isect, P, idir, ray->time, visibility, object, primAddr); hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
break; break;
} }
#if FEATURE(BVH_HAIR) #if FEATURE(BVH_HAIR)
@@ -265,14 +266,14 @@ ccl_device bool BVH_FUNCTION_NAME
case PRIMITIVE_MOTION_CURVE: { case PRIMITIVE_MOTION_CURVE: {
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) #if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
else else
hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
#else #else
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type); hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type);
else else
hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type); hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type);
#endif #endif
break; break;
@@ -306,9 +307,9 @@ ccl_device bool BVH_FUNCTION_NAME
object = kernel_tex_fetch(__prim_object, -primAddr-1); object = kernel_tex_fetch(__prim_object, -primAddr-1);
#if FEATURE(BVH_MOTION) #if FEATURE(BVH_MOTION)
bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax);
#else #else
bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax); bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, tmax);
#endif #endif
#if defined(__KERNEL_SSE2__) #if defined(__KERNEL_SSE2__)
@@ -336,9 +337,9 @@ ccl_device bool BVH_FUNCTION_NAME
/* instance pop */ /* instance pop */
#if FEATURE(BVH_MOTION) #if FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax);
#else #else
bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax); bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t, tmax);
#endif #endif
#if defined(__KERNEL_SSE2__) #if defined(__KERNEL_SSE2__)

View File

@@ -205,12 +205,12 @@ ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
#endif #endif
#ifdef __KERNEL_SSE2__ #ifdef __KERNEL_SSE2__
/* Pass P and idir by reference to aligned vector */ /* Pass P and dir by reference to aligned vector */
ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
#else #else
ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
#endif #endif
{ {
int segment = PRIMITIVE_UNPACK_SEGMENT(type); int segment = PRIMITIVE_UNPACK_SEGMENT(type);
@@ -222,7 +222,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
int prim = kernel_tex_fetch(__prim_index, curveAddr); int prim = kernel_tex_fetch(__prim_index, curveAddr);
#ifdef __KERNEL_SSE2__ #ifdef __KERNEL_SSE2__
__m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); __m128 vdir = load_m128(dir);
__m128 vcurve_coef[4]; __m128 vcurve_coef[4];
const float3 *curve_coef = (float3 *)vcurve_coef; const float3 *curve_coef = (float3 *)vcurve_coef;
@@ -285,8 +285,6 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
float3 curve_coef[4]; float3 curve_coef[4];
/* curve Intersection check */ /* curve Intersection check */
float3 dir = 1.0f/idir;
/* obtain curve parameters */ /* obtain curve parameters */
{ {
/* ray transform created - this should be created at beginning of intersection loop */ /* ray transform created - this should be created at beginning of intersection loop */
@@ -597,7 +595,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
} }
ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
{ {
/* define few macros to minimize code duplication for SSE */ /* define few macros to minimize code duplication for SSE */
#ifndef __KERNEL_SSE2__ #ifndef __KERNEL_SSE2__
@@ -647,9 +645,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
} }
/* --- */ /* --- */
float3 dir = 1.0f / idir;
float3 p21_diff = p2 - p1; float3 p21_diff = p2 - p1;
float3 sphere_dif1 = (dif + dif_second) * 0.5f; float3 sphere_dif1 = (dif + dif_second) * 0.5f;
float3 dir = direction;
float sphere_b_tmp = dot3(dir, sphere_dif1); float sphere_b_tmp = dot3(dir, sphere_dif1);
float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
#else #else
@@ -680,9 +678,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]); const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]);
const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
const __m128 dir = load_m128(direction);
const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
#endif #endif

View File

@@ -313,7 +313,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
* time and do a ray intersection with the resulting triangle */ * time and do a ray intersection with the resulting triangle */
ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect, ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, float time, uint visibility, int object, int triAddr) float3 P, float3 dir, float time, uint visibility, int object, int triAddr)
{ {
/* primitive index for vertex location lookup */ /* primitive index for vertex location lookup */
int prim = kernel_tex_fetch(__prim_index, triAddr); int prim = kernel_tex_fetch(__prim_index, triAddr);
@@ -324,10 +324,9 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
motion_triangle_vertices(kg, fobject, prim, time, verts); motion_triangle_vertices(kg, fobject, prim, time, verts);
/* ray-triangle intersection, unoptimized */ /* ray-triangle intersection, unoptimized */
float3 D = 1.0f/idir;
float t, u, v; float t, u, v;
if(ray_triangle_intersect_uv(P, D, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) { if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
isect->prim = triAddr; isect->prim = triAddr;
isect->object = object; isect->object = object;
isect->type = PRIMITIVE_MOTION_TRIANGLE; isect->type = PRIMITIVE_MOTION_TRIANGLE;
@@ -347,7 +346,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
float3 P, float3 idir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) float3 P, float3 dir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
{ {
/* primitive index for vertex location lookup */ /* primitive index for vertex location lookup */
int prim = kernel_tex_fetch(__prim_index, triAddr); int prim = kernel_tex_fetch(__prim_index, triAddr);
@@ -358,10 +357,9 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
motion_triangle_vertices(kg, fobject, prim, time, verts); motion_triangle_vertices(kg, fobject, prim, time, verts);
/* ray-triangle intersection, unoptimized */ /* ray-triangle intersection, unoptimized */
float3 D = 1.0f/idir;
float t, u, v; float t, u, v;
if(ray_triangle_intersect_uv(P, D, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) { if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
(*num_hits)++; (*num_hits)++;
int hit; int hit;

View File

@@ -361,33 +361,31 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
/* Object intersection in BVH */ /* Object intersection in BVH */
/* Keep every component of a ray direction away from zero, so that the
 * reciprocal taken afterwards (the inverse direction) cannot divide by zero.
 *
 * dir: ray direction.
 * Returns dir where each component whose magnitude is not greater than
 * exp2f(-80.0f) is replaced by that threshold carrying the sign of the
 * original component; larger components are passed through unchanged. */
ccl_device_inline float3 bvh_clamp_direction(float3 dir)
{
	/* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction;
	 * the 'f' suffix keeps the constant single precision instead of a double narrowed on assignment */
	const float ooeps = 8.271806E-25f;
	return make_float3((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x),
	                   (fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y),
	                   (fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
}
ccl_device_inline float3 bvh_inverse_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir)
{ {
/* avoid divide by zero (ooeps = exp2f(-80.0f)) */ return 1.0f / dir;
float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
float3 idir;
idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
return idir;
} }
/* Transform ray into object space to enter static object in BVH */ /* Transform ray into object space to enter static object in BVH */
ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax)
{ {
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, ray->P); *P = transform_point(&tfm, ray->P);
float3 dir = transform_direction(&tfm, ray->D);
float len; float len;
dir = normalize_len(dir, &len); *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
*idir = bvh_inverse_direction(dir);
if(*t != FLT_MAX) if(*t != FLT_MAX)
*t *= len; *t *= len;
@@ -395,7 +393,7 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
/* Transform ray to exit static object in BVH */ /* Transform ray to exit static object in BVH */
ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax)
{ {
if(*t != FLT_MAX) { if(*t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -403,25 +401,23 @@ ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray
} }
*P = ray->P; *P = ray->P;
*idir = bvh_inverse_direction(ray->D); *dir = bvh_clamp_direction(ray->D);
*idir = bvh_inverse_direction(*dir);
} }
#ifdef __OBJECT_MOTION__ #ifdef __OBJECT_MOTION__
/* Transform ray into object space to enter motion blurred object in BVH */ /* Transform ray into object space to enter motion blurred object in BVH */
ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax)
{ {
Transform itfm; Transform itfm;
*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm); *tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
*P = transform_point(&itfm, ray->P); *P = transform_point(&itfm, ray->P);
float3 dir = transform_direction(&itfm, ray->D);
float len; float len;
dir = normalize_len(dir, &len); *dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
*idir = bvh_inverse_direction(dir);
if(*t != FLT_MAX) if(*t != FLT_MAX)
*t *= len; *t *= len;
@@ -429,13 +425,14 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c
/* Transform ray to exit motion blurred object in BVH */ /* Transform ray to exit motion blurred object in BVH */
ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax)
{ {
if(*t != FLT_MAX) if(*t != FLT_MAX)
*t *= len(transform_direction(tfm, 1.0f/(*idir))); *t *= len(transform_direction(tfm, 1.0f/(*idir)));
*P = ray->P; *P = ray->P;
*idir = bvh_inverse_direction(ray->D); *dir = bvh_clamp_direction(ray->D);
*idir = bvh_inverse_direction(*dir);
} }
#endif #endif

View File

@@ -269,12 +269,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
* Based on Sven Woop's algorithm with precomputed triangle storage */ * Based on Sven Woop's algorithm with precomputed triangle storage */
ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect, ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int triAddr) float3 P, float3 dir, uint visibility, int object, int triAddr)
{ {
/* compute and check intersection t-value */ /* compute and check intersection t-value */
float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
float3 dir = 1.0f/idir;
float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
@@ -322,12 +321,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) float3 P, float3 dir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
{ {
/* compute and check intersection t-value */ /* compute and check intersection t-value */
float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
float3 dir = 1.0f/idir;
float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);