Cycles: SSE code paths for float3 arithmetic, explicit copy operations on
the SIMD vector types.

intern/cycles/util/util_math.h
  * The float3 operators (unary -, *, /, +, -, +=) take their float3 operands
    by const reference. When __KERNEL_SSE__ is defined they operate on the
    m128 member with SSE intrinsics; otherwise they keep the per-component
    make_float3() form.
  * float3 division uses _mm_div_ps in the SSE path. _mm_rcp_ps followed by a
    multiply is deliberately not used: it is only an approximate reciprocal,
    so the SSE and scalar branches would disagree.
  * The SSE paths process all four lanes of m128, including the lane that is
    not one of x, y, z.
  * New helper dot_xy(a, b) returns a.x*b.x + a.y*b.y.
  * normalize_len() and safe_normalize() multiply by the reciprocal of the
    length instead of dividing by the length.

intern/cycles/util/util_types.h
  * int3, int4, float3 and float4 gain an explicit copy constructor and copy
    assignment operator in the branch that declares the m128 member; both
    copy m128 as a whole.
  * float3 takes its __m128 constructor argument by const reference.

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 89a882d9b9d..c98407b1f77 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float3 operator-(const float3 a)
+ccl_device_inline float3 operator-(const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
 	return make_float3(-a.x, -a.y, -a.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, const float3 b)
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128, b.m128));
+#else
 	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, float f)
+ccl_device_inline float3 operator*(const float3& a, const float f)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator*(float f, const float3 a)
+ccl_device_inline float3 operator*(const float f, const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator/(float f, const float3 a)
+ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	return make_float3(f/a.x, f/a.y, f/a.z);
+#ifdef __KERNEL_SSE__
+	/* True divide: _mm_rcp_ps is only an approximate reciprocal. */
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
+#else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
 }
 
-ccl_device_inline float3 operator/(const float3 a, float f)
+ccl_device_inline float3 operator/(const float3& a, const float f)
 {
 	float invf = 1.0f/f;
-	return make_float3(a.x*invf, a.y*invf, a.z*invf);
+	return a * invf;
 }
 
-ccl_device_inline float3 operator/(const float3 a, const float3 b)
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+#ifdef __KERNEL_SSE__
+	/* True divide: _mm_rcp_ps is only an approximate reciprocal. */
+	return float3(_mm_div_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+(const float3 a, const float3 b)
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
 {
-	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
 }
 
-ccl_device_inline float3 operator-(const float3 a, const float3 b)
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
 {
-	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+=(float3& a, const float3 b)
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
 {
 	return a = a + b;
 }
@@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 #endif
 }
 
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a.m128, b.m128), b.m128));
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
 ccl_device_inline float dot(const float4 a, const float4 b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
 ccl_device_inline float3 normalize_len(const float3 a, float *t)
 {
 	*t = len(a);
-	return a/(*t);
+	float x = 1.0f / *t;
+	return a*x;
 }
 
 ccl_device_inline float3 safe_normalize(const float3 a)
 {
 	float t = len(a);
-	return (t != 0.0f)? a/t: a;
+	return (t != 0.0f)? a * (1.0f/t) : a;
 }
 
 ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 6af65f88a02..a000fae4bd6 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
 	__forceinline int3(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	__forceinline int3(const int3& a) : m128(a.m128) {}
+	__forceinline int3& operator =(const int3& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
 	__forceinline int4(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	__forceinline int4(const int4& a) : m128(a.m128) {}
+	__forceinline int4& operator =(const int4& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
 	};
 
 	__forceinline float3() {}
-	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline float3(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float3(const float3& a) : m128(a.m128) {}
+	__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
 #else
 	float x, y, z, w;
 #endif
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
 	__forceinline float4(const __m128 a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float4(const float4& a) : m128(a.m128) {}
+	__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
+
 #else
 	float x, y, z, w;
 #endif