You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3127 lines
93 KiB
3127 lines
93 KiB
//========= Copyright Valve Corporation, All rights reserved. ============// |
|
// |
|
// Purpose: - defines SIMD "structure of arrays" classes and functions. |
|
// |
|
//===========================================================================// |
|
#ifndef SSEMATH_H |
|
#define SSEMATH_H |
|
|
|
#if defined( _X360 ) |
|
#include <xboxmath.h> |
|
#elif defined(__arm__) || defined(__aarch64__) |
|
#include "sse2neon.h" |
|
#else |
|
#include <xmmintrin.h> |
|
#endif |
|
|
|
#include <mathlib/vector.h> |
|
#include <mathlib/mathlib.h> |
|
|
|
#if defined(GNUC) |
|
#define USE_STDC_FOR_SIMD 0 |
|
#else |
|
#define USE_STDC_FOR_SIMD 0 |
|
#endif |
|
|
|
#if !(defined(_X360) && (USE_STDC_FOR_SIMD == 0)) |
|
#define _SSE1 1 |
|
#endif |
|
|
|
// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, |
|
// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur |
|
// the relationship between packed floats and packed integer types and (b) not sure that the |
|
// compiler would handle generating good code for the intrinsics. |
|
|
|
#if USE_STDC_FOR_SIMD |
|
|
|
typedef union |
|
{ |
|
float m128_f32[4]; |
|
uint32 m128_u32[4]; |
|
} fltx4; |
|
|
|
typedef fltx4 i32x4; |
|
typedef fltx4 u32x4; |
|
|
|
#elif ( defined( _X360 ) ) |
|
|
|
typedef union |
|
{ |
|
// This union allows float/int access (which generally shouldn't be done in inner loops) |
|
__vector4 vmx; |
|
float m128_f32[4]; |
|
uint32 m128_u32[4]; |
|
} fltx4_union; |
|
|
|
typedef __vector4 fltx4; |
|
typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. |
|
typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. |
|
|
|
#else |
|
|
|
typedef __m128 fltx4; |
|
typedef __m128 i32x4; |
|
typedef __m128 u32x4; |
|
|
|
#endif |
|
|
|
// The FLTX4 type is a fltx4 used as a parameter to a function. |
|
// On the 360, the best way to do this is pass-by-copy on the registers. |
|
// On the PC, the best way is to pass by const reference. |
|
// The compiler will sometimes, but not always, replace a pass-by-const-ref |
|
// with a pass-in-reg on the 360; to avoid this confusion, you can |
|
// explicitly use a FLTX4 as the parameter type. |
|
#ifdef _X360 |
|
typedef __vector4 FLTX4; |
|
#else |
|
typedef const fltx4 & FLTX4; |
|
#endif |
|
|
|
// A 16-byte aligned int32 datastructure |
|
// (for use when writing out fltx4's as SIGNED |
|
// ints). |
|
struct ALIGN16 intx4 |
|
{ |
|
int32 m_i32[4]; |
|
|
|
inline int & operator[](int which) |
|
{ |
|
return m_i32[which]; |
|
} |
|
|
|
inline const int & operator[](int which) const |
|
{ |
|
return m_i32[which]; |
|
} |
|
|
|
inline int32 *Base() { |
|
return m_i32; |
|
} |
|
|
|
inline const int32 *Base() const |
|
{ |
|
return m_i32; |
|
} |
|
|
|
inline const bool operator==(const intx4 &other) const |
|
{ |
|
return m_i32[0] == other.m_i32[0] && |
|
m_i32[1] == other.m_i32[1] && |
|
m_i32[2] == other.m_i32[2] && |
|
m_i32[3] == other.m_i32[3] ; |
|
} |
|
} ALIGN16_POST; |
|
|
|
|
|
#if defined( _DEBUG ) && defined( _X360 ) |
|
// Debug-only sanity check: reads the vector status/control register and
// asserts that its fourth 32-bit word is zero.
FORCEINLINE void TestVPUFlags()
{
	// Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
	__vector4 vscr;
	__asm
	{
		mfvscr vscr;
	}
	unsigned int controlWord = ( (unsigned int *)&vscr )[3];
	Assert(controlWord == 0);
}
|
#else // _DEBUG |
|
FORCEINLINE void TestVPUFlags() {} |
|
#endif // _DEBUG |
|
|
|
|
|
// useful constants in SIMD packed float format: |
|
// (note: some of these aren't stored on the 360, |
|
// but are manufactured directly in one or two |
|
// instructions, saving a load and possible L2 |
|
// miss.) |
|
#ifndef _X360 |
|
extern const fltx4 Four_Zeros; // 0 0 0 0 |
|
extern const fltx4 Four_Ones; // 1 1 1 1 |
|
extern const fltx4 Four_Twos; // 2 2 2 2 |
|
extern const fltx4 Four_Threes; // 3 3 3 3 |
|
extern const fltx4 Four_Fours; // guess. |
|
extern const fltx4 Four_Point225s; // .225 .225 .225 .225 |
|
extern const fltx4 Four_PointFives; // .5 .5 .5 .5 |
|
extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON |
|
extern const fltx4 Four_2ToThe21s; // (1<<21).. |
|
extern const fltx4 Four_2ToThe22s; // (1<<22).. |
|
extern const fltx4 Four_2ToThe23s; // (1<<23).. |
|
extern const fltx4 Four_2ToThe24s; // (1<<24).. |
|
extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) |
|
extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 |
|
#else |
|
#define Four_Zeros XMVectorZero() // 0 0 0 0 |
|
#define Four_Ones XMVectorSplatOne() // 1 1 1 1 |
|
extern const fltx4 Four_Twos; // 2 2 2 2 |
|
extern const fltx4 Four_Threes; // 3 3 3 3 |
|
extern const fltx4 Four_Fours; // guess. |
|
extern const fltx4 Four_Point225s; // .225 .225 .225 .225 |
|
extern const fltx4 Four_PointFives; // .5 .5 .5 .5 |
|
extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON |
|
extern const fltx4 Four_2ToThe21s; // (1<<21).. |
|
extern const fltx4 Four_2ToThe22s; // (1<<22).. |
|
extern const fltx4 Four_2ToThe23s; // (1<<23).. |
|
extern const fltx4 Four_2ToThe24s; // (1<<24).. |
|
extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) |
|
extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 |
|
#endif |
|
extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX |
|
extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX |
|
extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float |
|
|
|
// external aligned integer constants |
|
extern const ALIGN16 uint32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 |
|
extern const ALIGN16 uint32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 |
|
extern const ALIGN16 uint32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 |
|
extern const ALIGN16 uint32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 |
|
extern const ALIGN16 uint32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] |
|
extern const ALIGN16 uint32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 |
|
extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 |
|
|
|
// this mask is used for skipping the tail of things. If you have N elements in an array, and wish |
|
// to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration.
|
extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; |
|
|
|
// Define prefetch macros. |
|
// The characteristics of cache and prefetch are completely |
|
// different between the different platforms, so you DO NOT |
|
// want to just define one macro that maps to every platform |
|
// intrinsic under the hood -- you need to prefetch at different |
|
// intervals between x86 and PPC, for example, and that is |
|
// a higher level code change. |
|
// On the other hand, I'm tired of typing #ifdef _X360 |
|
// all over the place, so this is just a nop on Intel, PS3. |
|
#ifdef _X360 |
|
#define PREFETCH360(address, offset) __dcbt(offset,address) |
|
#else |
|
#define PREFETCH360(x,y) // nothing |
|
#endif |
|
|
|
#if USE_STDC_FOR_SIMD |
|
|
|
//--------------------------------------------------------------------- |
|
// Standard C (fallback/Linux) implementation (only there for compat - slow) |
|
//--------------------------------------------------------------------- |
|
|
|
// Returns (by value) the float stored in lane idx of a.
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	const float *pLanes = a.m128_f32;
	return pLanes[idx];
}
|
|
|
// Returns a writable reference to the float stored in lane idx of a.
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
	float *pLanes = a.m128_f32;
	return pLanes[idx];
}
|
|
|
// Returns (by value) the raw 32-bit pattern stored in lane idx of a.
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
	const uint32 *pLanes = a.m128_u32;
	return pLanes[idx];
}
|
|
|
// Returns a writable reference to the raw 32-bit pattern in lane idx of a.
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
	uint32 *pLanes = a.m128_u32;
	return pLanes[idx];
}
|
|
|
// Return zero in the fastest way -- on the x360, faster even than loading.
|
FORCEINLINE fltx4 LoadZeroSIMD( void ) |
|
{ |
|
return Four_Zeros; |
|
} |
|
|
|
// Return one in the fastest way -- on the x360, faster even than loading. |
|
FORCEINLINE fltx4 LoadOneSIMD( void ) |
|
{ |
|
return Four_Ones; |
|
} |
|
|
|
// Broadcasts lane 0 (x) of a into all four lanes of the result.
FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
{
	const float flX = SubFloat( a, 0 );
	fltx4 splat;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( splat, lane ) = flX;
	}
	return splat;
}
|
|
|
// Broadcasts lane 1 (y) of a into all four lanes of the result.
FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	const float flY = SubFloat( a, 1 );
	fltx4 splat;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( splat, lane ) = flY;
	}
	return splat;
}
|
|
|
// Broadcasts lane 2 (z) of a into all four lanes of the result.
FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	const float flZ = SubFloat( a, 2 );
	fltx4 splat;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( splat, lane ) = flZ;
	}
	return splat;
}
|
|
|
// Broadcasts lane 3 (w) of a into all four lanes of the result.
FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	const float flW = SubFloat( a, 3 );
	fltx4 splat;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( splat, lane ) = flW;
	}
	return splat;
}
|
|
|
// Returns a copy of a whose lane 0 is replaced by lane 0 of x.
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	const float flNewX = SubFloat( x, 0 );
	fltx4 merged = a;
	SubFloat( merged, 0 ) = flNewX;
	return merged;
}
|
|
|
// Returns a copy of a whose lane 1 is replaced by lane 1 of y.
FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	const float flNewY = SubFloat( y, 1 );
	fltx4 merged = a;
	SubFloat( merged, 1 ) = flNewY;
	return merged;
}
|
|
|
// Returns a copy of a whose lane 2 is replaced by lane 2 of z.
FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	const float flNewZ = SubFloat( z, 2 );
	fltx4 merged = a;
	SubFloat( merged, 2 ) = flNewZ;
	return merged;
}
|
|
|
// Returns a copy of a whose lane 3 is replaced by lane 3 of w.
FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	const float flNewW = SubFloat( w, 3 );
	fltx4 merged = a;
	SubFloat( merged, 3 ) = flNewW;
	return merged;
}
|
|
|
// Returns a copy of a with lane nComponent overwritten by flValue.
// nComponent is used directly as an index; it is not range checked.
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
	fltx4 patched = a;
	float &flLane = SubFloat( patched, nComponent );
	flLane = flValue;
	return patched;
}
|
|
|
// a b c d -> b c d a |
|
FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = SubFloat( a, 1 ); |
|
SubFloat( retVal, 1 ) = SubFloat( a, 2 ); |
|
SubFloat( retVal, 2 ) = SubFloat( a, 3 ); |
|
SubFloat( retVal, 3 ) = SubFloat( a, 0 ); |
|
return retVal; |
|
} |
|
|
|
// a b c d -> c d a b |
|
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = SubFloat( a, 2 ); |
|
SubFloat( retVal, 1 ) = SubFloat( a, 3 ); |
|
SubFloat( retVal, 2 ) = SubFloat( a, 0 ); |
|
SubFloat( retVal, 3 ) = SubFloat( a, 1 ); |
|
return retVal; |
|
} |
|
|
|
#define BINOP(op) \ |
|
fltx4 retVal; \ |
|
SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) ); \ |
|
SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) ); \ |
|
SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) ); \ |
|
SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) ); \ |
|
return retVal; |
|
|
|
#define IBINOP(op) \ |
|
fltx4 retVal; \ |
|
SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) ); \ |
|
SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) ); \ |
|
SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) ); \ |
|
SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) ); \ |
|
return retVal; |
|
|
|
// Lane-wise float addition: a + b
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 sum;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( sum, lane ) = ( SubFloat( a, lane ) + SubFloat( b, lane ) );
	}
	return sum;
}
|
|
|
// Lane-wise float subtraction: a - b
FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 difference;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( difference, lane ) = ( SubFloat( a, lane ) - SubFloat( b, lane ) );
	}
	return difference;
}
|
|
|
// Lane-wise float multiplication: a * b
FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 product;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( product, lane ) = ( SubFloat( a, lane ) * SubFloat( b, lane ) );
	}
	return product;
}
|
|
|
// Lane-wise float division: a / b
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 quotient;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( quotient, lane ) = ( SubFloat( a, lane ) / SubFloat( b, lane ) );
	}
	return quotient;
}
|
|
|
|
|
// Multiply-add built from MulSIMD followed by AddSIMD: a*b + c
FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 product = MulSIMD( a, b );
	return AddSIMD( product, c );
}
|
|
|
// Multiply-subtract built from MulSIMD followed by SubSIMD: c - a*b
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 product = MulSIMD( a, b );
	return SubSIMD( c, product );
}
|
|
|
|
|
// Applies sin() to each of the four lanes of radians.
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	fltx4 sines;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( sines, lane ) = sin( SubFloat( radians, lane ) );
	}
	return sines;
}
|
|
|
// Calls SinCos for lanes 0..2 of radians, writing into the matching lanes
// of sine and cosine. Lane 3 of both outputs is left untouched.
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	for ( int lane = 0; lane < 3; ++lane )
	{
		SinCos( SubFloat( radians, lane ), &SubFloat( sine, lane ), &SubFloat( cosine, lane ) );
	}
}
|
|
|
// Calls SinCos for all four lanes of radians, writing into the matching
// lanes of sine and cosine.
FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	for ( int lane = 0; lane < 4; ++lane )
	{
		SinCos( SubFloat( radians, lane ), &SubFloat( sine, lane ), &SubFloat( cosine, lane ) );
	}
}
|
|
|
// Applies asin() to each of the four lanes of sine.
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	fltx4 angles;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( angles, lane ) = asin( SubFloat( sine, lane ) );
	}
	return angles;
}
|
|
|
// Applies acos() to each of the four lanes of cs.
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	fltx4 angles;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( angles, lane ) = acos( SubFloat( cs, lane ) );
	}
	return angles;
}
|
|
|
// tan^1(a/b) .. ie, pass sin in as a and cos in as b |
|
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
fltx4 result; |
|
SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); |
|
SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); |
|
SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); |
|
SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); |
|
return result; |
|
} |
|
|
|
// Lane-wise maximum: max(a,b)
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 larger;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( larger, lane ) = max( SubFloat( a, lane ), SubFloat( b, lane ) );
	}
	return larger;
}
|
|
|
// Lane-wise minimum: min(a,b)
FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 smaller;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( smaller, lane ) = min( SubFloat( a, lane ), SubFloat( b, lane ) );
	}
	return smaller;
}
|
|
|
// Lane-wise bitwise AND of the raw 32-bit patterns: a & b
FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 bits;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubInt( bits, lane ) = ( SubInt( a, lane ) & SubInt( b, lane ) );
	}
	return bits;
}
|
|
|
// Lane-wise bitwise and-not of the raw 32-bit patterns: ~a & b
// (note that it is the FIRST operand that gets complemented).
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 bits;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubInt( bits, lane ) = ~SubInt( a, lane ) & SubInt( b, lane );
	}
	return bits;
}
|
|
|
// Lane-wise bitwise XOR of the raw 32-bit patterns: a ^ b
FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 bits;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubInt( bits, lane ) = ( SubInt( a, lane ) ^ SubInt( b, lane ) );
	}
	return bits;
}
|
|
|
// Lane-wise bitwise OR of the raw 32-bit patterns: a | b
FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 bits;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubInt( bits, lane ) = ( SubInt( a, lane ) | SubInt( b, lane ) );
	}
	return bits;
}
|
|
|
// Lane-wise float negation: -a
FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
{
	fltx4 negated;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( negated, lane ) = -SubFloat( a, lane );
	}
	return negated;
}
|
|
|
// True when every lane of a compares equal to 0.0 as a float.
FORCEINLINE bool IsAllZeros( const fltx4 & a )
{
	for ( int lane = 0; lane < 4; ++lane )
	{
		if ( !( SubFloat( a, lane ) == 0.0 ) )
		{
			return false;
		}
	}
	return true;
}
|
|
|
|
|
// for branching when a.xyzw > b.xyzw |
|
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
return SubFloat(a,0) > SubFloat(b,0) && |
|
SubFloat(a,1) > SubFloat(b,1) && |
|
SubFloat(a,2) > SubFloat(b,2) && |
|
SubFloat(a,3) > SubFloat(b,3); |
|
} |
|
|
|
// for branching when a.xyzw >= b.xyzw |
|
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
return SubFloat(a,0) >= SubFloat(b,0) && |
|
SubFloat(a,1) >= SubFloat(b,1) && |
|
SubFloat(a,2) >= SubFloat(b,2) && |
|
SubFloat(a,3) >= SubFloat(b,3); |
|
} |
|
|
|
// For branching if all a.xyzw == b.xyzw |
|
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) |
|
{ |
|
return SubFloat(a,0) == SubFloat(b,0) && |
|
SubFloat(a,1) == SubFloat(b,1) && |
|
SubFloat(a,2) == SubFloat(b,2) && |
|
SubFloat(a,3) == SubFloat(b,3); |
|
} |
|
|
|
// Builds a 4-bit mask of which lanes have their high (sign) bit set:
// sign(x) -> bit 0, sign(y) -> bit 1, sign(z) -> bit 2, sign(w) -> bit 3.
FORCEINLINE int TestSignSIMD( const fltx4 & a )
{
	int nMask = 0;
	for ( int lane = 0; lane < 4; ++lane )
	{
		// Move bit 31 of this lane down to bit position 'lane'.
		nMask |= ( SubInt( a, lane ) & 0x80000000 ) >> ( 31 - lane );
	}
	return nMask;
}
|
|
|
// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0), decided from the sign
// bits reported by TestSignSIMD.
FORCEINLINE bool IsAnyNegative( const fltx4 & a )
{
	const int nSignBits = TestSignSIMD( a );
	return nSignBits != 0;
}
|
|
|
// Per-lane mask: (a==b) ? ~0:0
FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 mask;
	for ( int lane = 0; lane < 4; ++lane )
	{
		const bool bPass = ( SubFloat( a, lane ) == SubFloat( b, lane ) );
		SubInt( mask, lane ) = bPass ? ~0 : 0;
	}
	return mask;
}
|
|
|
// Per-lane mask: (a>b) ? ~0:0
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 mask;
	for ( int lane = 0; lane < 4; ++lane )
	{
		const bool bPass = ( SubFloat( a, lane ) > SubFloat( b, lane ) );
		SubInt( mask, lane ) = bPass ? ~0 : 0;
	}
	return mask;
}
|
|
|
// Per-lane mask: (a>=b) ? ~0:0
FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 mask;
	for ( int lane = 0; lane < 4; ++lane )
	{
		const bool bPass = ( SubFloat( a, lane ) >= SubFloat( b, lane ) );
		SubInt( mask, lane ) = bPass ? ~0 : 0;
	}
	return mask;
}
|
|
|
// Per-lane mask: (a<b) ? ~0:0
FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 mask;
	for ( int lane = 0; lane < 4; ++lane )
	{
		const bool bPass = ( SubFloat( a, lane ) < SubFloat( b, lane ) );
		SubInt( mask, lane ) = bPass ? ~0 : 0;
	}
	return mask;
}
|
|
|
// Per-lane mask: (a<=b) ? ~0:0
FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 mask;
	for ( int lane = 0; lane < 4; ++lane )
	{
		const bool bPass = ( SubFloat( a, lane ) <= SubFloat( b, lane ) );
		SubInt( mask, lane ) = bPass ? ~0 : 0;
	}
	return mask;
}
|
|
|
// Per-lane mask: (a <= b && a >= -b) ? ~0 : 0
FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 mask;
	for ( int lane = 0; lane < 4; ++lane )
	{
		const bool bInside = ( SubFloat( a, lane ) <= SubFloat( b, lane ) && SubFloat( a, lane ) >= -SubFloat( b, lane ) );
		SubInt( mask, lane ) = bInside ? ~0 : 0;
	}
	return mask;
}
|
|
|
|
|
// Bitwise select: result bits come from NewValue where ReplacementMask is
// set and from OldValue where it is clear.
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
	const fltx4 replaced = AndSIMD( ReplacementMask, NewValue );
	const fltx4 retained = AndNotSIMD( ReplacementMask, OldValue );
	return OrSIMD( replaced, retained );
}
|
|
|
// Fills all four lanes with the same float: a,a,a,a
FORCEINLINE fltx4 ReplicateX4( float flValue )
{
	fltx4 filled;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( filled, lane ) = flValue;
	}
	return filled;
}
|
|
|
/// replicate a single 32 bit integer value to all 4 components of an m128 |
|
FORCEINLINE fltx4 ReplicateIX4( int nValue ) |
|
{ |
|
fltx4 retVal; |
|
SubInt( retVal, 0 ) = nValue; |
|
SubInt( retVal, 1 ) = nValue; |
|
SubInt( retVal, 2 ) = nValue; |
|
SubInt( retVal, 3 ) = nValue; |
|
return retVal; |
|
|
|
} |
|
|
|
// Round towards positive infinity |
|
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); |
|
SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); |
|
SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); |
|
SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); |
|
return retVal; |
|
|
|
} |
|
|
|
// Round towards negative infinity |
|
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) ); |
|
SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) ); |
|
SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) ); |
|
SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) ); |
|
return retVal; |
|
|
|
} |
|
|
|
// sqrt(a), more or less. In this fallback it is computed with sqrt() per
// lane, exactly like SqrtSIMD.
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
{
	fltx4 roots;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( roots, lane ) = sqrt( SubFloat( a, lane ) );
	}
	return roots;
}
|
|
|
// sqrt(a), computed with sqrt() per lane.
FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
{
	fltx4 roots;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( roots, lane ) = sqrt( SubFloat( a, lane ) );
	}
	return roots;
}
|
|
|
// 1/sqrt(a), more or less. In this fallback it is the same per-lane
// computation as ReciprocalSqrtSIMD.
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
{
	fltx4 inverseRoots;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( inverseRoots, lane ) = 1.0 / sqrt( SubFloat( a, lane ) );
	}
	return inverseRoots;
}
|
|
|
// 1/sqrt(a) per lane, where a lane equal to zero is replaced by FLT_EPSILON
// before the square root is taken.
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	fltx4 inverseRoots;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( inverseRoots, lane ) = 1.0 / sqrt( SubFloat( a, lane ) != 0.0f ? SubFloat( a, lane ) : FLT_EPSILON );
	}
	return inverseRoots;
}
|
|
|
// 1/sqrt(a), computed per lane.
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
{
	fltx4 inverseRoots;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( inverseRoots, lane ) = 1.0 / sqrt( SubFloat( a, lane ) );
	}
	return inverseRoots;
}
|
|
|
// 1/a, more or less. In this fallback it is a plain per-lane division,
// the same as ReciprocalSIMD.
FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
{
	fltx4 inverses;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( inverses, lane ) = 1.0 / SubFloat( a, lane );
	}
	return inverses;
}
|
|
|
// 1/a, computed as a plain per-lane division.
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
{
	fltx4 inverses;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( inverses, lane ) = 1.0 / SubFloat( a, lane );
	}
	return inverses;
}
|
|
|
/// 1/x for all 4 values. |
|
/// 1/0 will result in a big but NOT infinite result |
|
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 )); |
|
SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 )); |
|
SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 )); |
|
SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 )); |
|
return retVal; |
|
} |
|
|
|
// 1/x per lane, where a lane equal to zero is divided as FLT_EPSILON
// instead (so the result stays finite).
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
	fltx4 inverses;
	for ( int lane = 0; lane < 4; ++lane )
	{
		SubFloat( inverses, lane ) = 1.0 / (SubFloat( a, lane ) == 0.0f ? FLT_EPSILON : SubFloat( a, lane ));
	}
	return inverses;
}
|
|
|
// 2^x for all values (the antilog) |
|
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) ); |
|
SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) ); |
|
SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) ); |
|
SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) ); |
|
|
|
return retVal; |
|
} |
|
|
|
// Dot product of the x, y, z lanes of a and b, replicated to all four
// lanes of the result. The w lanes are ignored.
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	// Kept as one expression so the summation order is x*x + y*y + z*z.
	return ReplicateX4(
		SubFloat( a, 0 ) * SubFloat( b, 0 ) +
		SubFloat( a, 1 ) * SubFloat( b, 1 ) +
		SubFloat( a, 2 ) * SubFloat( b, 2 ) );
}
|
|
|
// Dot product of all four lanes of a and b, replicated to all four lanes
// of the result.
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	// Kept as one expression so the summation order is x, y, z, then w.
	return ReplicateX4(
		SubFloat( a, 0 ) * SubFloat( b, 0 ) +
		SubFloat( a, 1 ) * SubFloat( b, 1 ) +
		SubFloat( a, 2 ) * SubFloat( b, 2 ) +
		SubFloat( a, 3 ) * SubFloat( b, 3 ) );
}
|
|
|
// Clamps the components of a vector to a specified minimum and maximum range. |
|
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) |
|
{ |
|
return MaxSIMD( min, MinSIMD( max, in ) ); |
|
} |
|
|
|
// Squelch the w component of a vector to +0.0. |
|
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) |
|
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) |
|
{ |
|
fltx4 retval; |
|
retval = a; |
|
SubFloat( retval, 0 ) = 0; |
|
return retval; |
|
} |
|
|
|
// Reads a whole fltx4 from the memory pSIMD points at.
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
	return *pSource;
}
|
|
|
// Loads three consecutive floats from pSIMD into the x, y and z lanes.
// Only those three floats are read, so the source may be a 12-byte object
// (the counterpart StoreUnaligned3SIMD likewise writes only three floats).
// The w lane of the result is set to 0.
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	const float *pSource = reinterpret_cast< const float * >( pSIMD );
	fltx4 retVal;
	SubFloat( retVal, 0 ) = pSource[0];
	SubFloat( retVal, 1 ) = pSource[1];
	SubFloat( retVal, 2 ) = pSource[2];
	// Do not touch pSource[3]: it may lie past the end of the source object.
	SubFloat( retVal, 3 ) = 0.0f;
	return retVal;
}
|
|
|
// Reads a whole fltx4 from the memory pSIMD points at.
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
	return *pSource;
}
|
|
|
// for the transitional class -- load a 3-by VectorAligned and squash its w component |
|
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) |
|
{ |
|
fltx4 retval = LoadAlignedSIMD(pSIMD.Base()); |
|
// squelch w |
|
SubInt( retval, 3 ) = 0; |
|
return retval; |
|
} |
|
|
|
// Writes all four lanes of a to the memory pSIMD points at.
FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
	fltx4 *pDest = reinterpret_cast< fltx4 *> ( pSIMD );
	*pDest = a;
}
|
|
|
// Writes all four lanes of a to the memory pSIMD points at.
FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
	fltx4 *pDest = reinterpret_cast< fltx4 *> ( pSIMD );
	*pDest = a;
}
|
|
|
// Writes only the x, y and z lanes of a to pSIMD[0..2]; nothing is written
// past the third float.
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	for ( int lane = 0; lane < 3; ++lane )
	{
		pSIMD[lane] = SubFloat(a, lane);
	}
}
|
|
|
// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD |
|
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
StoreAlignedSIMD(pSIMD->Base(),a); |
|
} |
|
|
|
// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w:
// afterwards lane c of row r holds what lane r of row c held before.
// Implemented as a plain loop over the upper triangle so that no helper
// macro is left defined for code that includes this header.
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	fltx4 *pRows[4] = { &x, &y, &z, &w };
	for ( int nRow = 0; nRow < 3; ++nRow )
	{
		for ( int nCol = nRow + 1; nCol < 4; ++nCol )
		{
			// Swap element (nRow, nCol) with its mirror (nCol, nRow).
			const float flTmp = SubFloat( *pRows[nRow], nCol );
			SubFloat( *pRows[nRow], nCol ) = SubFloat( *pRows[nCol], nRow );
			SubFloat( *pRows[nCol], nRow ) = flTmp;
		}
	}
}
|
|
|
// find the lowest component of a.x, a.y, a.z, |
|
// and replicate it to the whole return value. |
|
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a ) |
|
{ |
|
float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2)); |
|
return ReplicateX4(lowest); |
|
} |
|
|
|
// find the highest component of a.x, a.y, a.z, |
|
// and replicate it to the whole return value. |
|
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a ) |
|
{ |
|
float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2)); |
|
return ReplicateX4(highest); |
|
} |
|
|
|
// Fixed-point conversion and save as SIGNED INTS. |
|
// pDest->x = Int (vSrc.x) |
|
// note: some architectures have means of doing |
|
// fixed point conversion when the fix depth is |
|
// specified as an immediate.. but there is no way |
|
// to guarantee an immediate as a parameter to function |
|
// like this. |
|
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) |
|
{ |
|
(*pDest)[0] = SubFloat(vSrc, 0); |
|
(*pDest)[1] = SubFloat(vSrc, 1); |
|
(*pDest)[2] = SubFloat(vSrc, 2); |
|
(*pDest)[3] = SubFloat(vSrc, 3); |
|
} |
|
|
|
// ------------------------------------ |
|
// INTEGER SIMD OPERATIONS. |
|
// ------------------------------------ |
|
// splat all components of a vector to a signed immediate int number. |
|
FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue ) |
|
{ |
|
fltx4 retval; |
|
SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue; |
|
return retval; |
|
} |
|
|
|
// Load 4 aligned words into a SIMD register |
|
FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD) |
|
{ |
|
return *( reinterpret_cast< const i32x4 *> ( pSIMD ) ); |
|
} |
|
|
|
// Load 4 unaligned words into a SIMD register |
|
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD) |
|
{ |
|
return *( reinterpret_cast< const i32x4 *> ( pSIMD ) ); |
|
} |
|
|
|
// save into four words, 16-byte aligned |
|
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) |
|
{ |
|
*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; |
|
} |
|
|
|
// Saves the four 32-bit lanes of a into an intx4, through its Base() pointer.
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	i32x4 *pDest = reinterpret_cast< i32x4 *> ( pSIMD.Base() );
	*pDest = a;
}
|
|
|
// Saves the four 32-bit lanes of a to the memory pSIMD points at.
FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	i32x4 *pDest = reinterpret_cast< i32x4 *> ( pSIMD );
	*pDest = a;
}
|
|
|
// Take a fltx4 containing fixed-point uints and |
|
// return them as single precision floats. No |
|
// fixed point conversion is done. |
|
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) |
|
{ |
|
Assert(0); /* pc has no such operation */ |
|
fltx4 retval; |
|
SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); |
|
SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); |
|
SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); |
|
SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); |
|
return retval; |
|
} |
|
|
|
|
|
#if 0 /* pc has no such op */ |
|
// Take a fltx4 containing fixed-point sints and |
|
// return them as single precision floats. No |
|
// fixed point conversion is done. |
|
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) |
|
{ |
|
fltx4 retval; |
|
SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) ); |
|
SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) ); |
|
SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) ); |
|
SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) ); |
|
return retval; |
|
} |
|
|
|
|
|
/* |
|
works on fltx4's as if they are four uints. |
|
the first parameter contains the words to be shifted, |
|
the second contains the amount to shift by AS INTS |
|
|
|
for i = 0 to 3 |
|
shift = vSrcB_i*32:(i*32)+4 |
|
vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift |
|
*/ |
|
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB) |
|
{ |
|
i32x4 retval; |
|
SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0); |
|
SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1); |
|
SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2); |
|
SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3); |
|
|
|
|
|
return retval; |
|
} |
|
#endif |
|
|
|
#elif ( defined( _X360 ) ) |
|
|
|
//--------------------------------------------------------------------- |
|
// X360 implementation |
|
//--------------------------------------------------------------------- |
|
|
|
// Writable reference to float component idx of a, obtained by viewing the
// register through fltx4_union.
FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
{
	return ( (fltx4_union &)a ).m128_f32[idx];
}
|
|
|
// Writable reference to 32-bit word idx of a, obtained by viewing the
// register through fltx4_union.
FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
{
	return ( (fltx4_union &)a ).m128_u32[idx];
}
|
|
|
// a+b, via __vaddfp.
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vSum = __vaddfp( a, b );
	return vSum;
}
|
|
|
// a-b, via __vsubfp.
FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vDifference = __vsubfp( a, b );
	return vDifference;
}
|
|
|
// a*b, via __vmulfp.
FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vProduct = __vmulfp( a, b );
	return vProduct;
}
|
|
|
// a*b + c, via __vmaddfp.
FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 vResult = __vmaddfp( a, b, c );
	return vResult;
}
|
|
|
// c - a*b, via __vnmsubfp.
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 vResult = __vnmsubfp( a, b, c );
	return vResult;
}
|
|
|
// Three-component dot product of a and b, via __vmsum3fp.
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	const fltx4 vDot = __vmsum3fp( a, b );
	return vDot;
}
|
|
|
// Four-component dot product of a and b, via __vmsum4fp.
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	const fltx4 vDot = __vmsum4fp( a, b );
	return vDot;
}
|
|
|
// Sine of radians; forwards to XMVectorSin.
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	const fltx4 vSine = XMVectorSin( radians );
	return vSine;
}
|
|
|
// Writes the sine and cosine of radians to the two output vectors.
// On this platform it forwards to the same XMVectorSinCos call as SinCosSIMD.
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	fltx4 *pSineOut = &sine;
	fltx4 *pCosineOut = &cosine;
	XMVectorSinCos( pSineOut, pCosineOut, radians );
}
|
|
|
// Writes the sine and cosine of radians to the two output vectors via
// XMVectorSinCos.
FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	fltx4 *pSineOut = &sine;
	fltx4 *pCosineOut = &cosine;
	XMVectorSinCos( pSineOut, pCosineOut, radians );
}
|
|
|
// Writes the cosine of radians (XMVectorCos) into the output parameter.
FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
{
	const fltx4 vCosine = XMVectorCos( radians );
	cosine = vCosine;
}
|
|
|
// Arcsine of sine; forwards to XMVectorASin.
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	const fltx4 vAngle = XMVectorASin( sine );
	return vAngle;
}
|
|
|
// Arccosine of cs; forwards to XMVectorACos.
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	const fltx4 vAngle = XMVectorACos( cs );
	return vAngle;
}
|
|
|
// tan^1(a/b) .. ie, pass sin in as a and cos in as b |
|
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
return XMVectorATan2( a, b ); |
|
} |
|
|
|
// DivSIMD defined further down, since it uses ReciprocalSIMD |
|
|
|
// max(a,b), via __vmaxfp.
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vLarger = __vmaxfp( a, b );
	return vLarger;
}
|
|
|
// min(a,b), via __vminfp.
FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vSmaller = __vminfp( a, b );
	return vSmaller;
}
|
|
|
// a & b, via __vand.
FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = __vand( a, b );
	return vBits;
}
|
|
|
// ~a & b.
// The operands are deliberately passed to __vandc in swapped order: SSE
// complements the first argument, VMX complements the second.
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = __vandc( b, a );
	return vBits;
}
|
|
|
// a ^ b, via __vxor.
FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = __vxor( a, b );
	return vBits;
}
|
|
|
// a | b, via __vor.
FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = __vor( a, b );
	return vBits;
}
|
|
|
// negate: -a, via XMVectorNegate.
FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
{
	const fltx4 vNegated = XMVectorNegate( a );
	return vNegated;
}
|
|
|
// Are all floats of a zero?
// Compares a against Four_Zeros with the recording form of the compare and
// tests the resulting flags for "all true".
FORCEINLINE bool IsAllZeros( const fltx4 & a )
{
	unsigned int nCompareFlags = 0;
	__vcmpeqfpR( a, Four_Zeros, &nCompareFlags );
	const bool bAllZero = XMComparisonAllTrue( nCompareFlags );
	return bAllZero;
}
|
|
|
// Is any float of a zero?
// Compares a against XMVectorZero() and tests the flags for "any true".
FORCEINLINE bool IsAnyZeros( const fltx4 & a )
{
	unsigned int nCompareFlags;
	XMVectorEqualR( &nCompareFlags, a, XMVectorZero() );
	const bool bAnyZero = XMComparisonAnyTrue( nCompareFlags );
	return bAnyZero;
}
|
|
|
// Are any of x, y, z zero? (w is ignored.)
FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )
{
	// Overwrite w with a copy of x so that a zero in w cannot trigger the test.
	const fltx4 vXYZX = __vrlimi( a, a, 1, 1 );
	unsigned int nCompareFlags;
	XMVectorEqualR( &nCompareFlags, vXYZX, XMVectorZero() );
	const bool bAnyZero = XMComparisonAnyTrue( nCompareFlags );
	return bAnyZero;
}
|
|
|
// for branching when a.xyzw > b.xyzw |
|
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
unsigned int cr; |
|
XMVectorGreaterR(&cr,a,b); |
|
return XMComparisonAllTrue(cr); |
|
} |
|
|
|
// for branching when a.xyzw >= b.xyzw |
|
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
unsigned int cr; |
|
XMVectorGreaterOrEqualR(&cr,a,b); |
|
return XMComparisonAllTrue(cr); |
|
} |
|
|
|
// For branching if all a.xyzw == b.xyzw |
|
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) |
|
{ |
|
unsigned int cr; |
|
XMVectorEqualR(&cr,a,b); |
|
return XMComparisonAllTrue(cr); |
|
} |
|
|
|
|
|
// Mask of which floats have the high bit set:
// bit 0 = sign(x), bit 1 = sign(y), bit 2 = sign(z), bit 3 = sign(w).
// NOTE: this maps to SSE way better than it does to VMX (most code uses
// IsAnyNegative(), though).
FORCEINLINE int TestSignSIMD( const fltx4 & a )
{
	const fltx4_union & words = (const fltx4_union &)a;

	// Move each sign bit down to bit 0, then up to its slot in the mask.
	const unsigned int nSignX = ( words.m128_u32[0] >> 31 );
	const unsigned int nSignY = ( words.m128_u32[1] >> 31 ) << 1;
	const unsigned int nSignZ = ( words.m128_u32[2] >> 31 ) << 2;
	const unsigned int nSignW = ( words.m128_u32[3] >> 31 ) << 3;

	return (int)( nSignX | nSignY | nSignZ | nSignW );
}
|
|
|
// Squelch the w component of a vector to +0.0. |
|
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) |
|
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) |
|
{ |
|
return __vrlimi( a, __vzero(), 1, 0 ); |
|
} |
|
|
|
// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
// NOTE: this tests the top bit of each element using integer math, so it
// ignores NaNs - it will return true for "-NaN".
FORCEINLINE bool IsAnyNegative( const fltx4 & a )
{
	unsigned int nCompareFlags = 0;

	// Build 0x80000000 in every word: splat -1 (all ones, so the low-order
	// 5 bits of each element = 31), then shift each word left by that amount.
	fltx4 vSignBits = __vspltisw( -1 );
	vSignBits = __vslw( vSignBits, vSignBits );

	// If every masked word equals zero, no sign bit was set.
	__vcmpequwR( Four_Zeros, __vand( vSignBits, a ), &nCompareFlags );
	const bool bNoSignBits = XMComparisonAllTrue( nCompareFlags );
	return !bNoSignBits;
}
|
|
|
// (a==b) ? ~0:0
FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vMask = __vcmpeqfp( a, b );
	return vMask;
}
|
|
|
|
|
// (a>b) ? ~0:0
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vMask = __vcmpgtfp( a, b );
	return vMask;
}
|
|
|
// (a>=b) ? ~0:0
FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vMask = __vcmpgefp( a, b );
	return vMask;
}
|
|
|
// (a<b) ? ~0:0 -- expressed as "b > a" with the operands swapped.
FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vMask = __vcmpgtfp( b, a );
	return vMask;
}
|
|
|
// (a<=b) ? ~0:0 -- expressed as "b >= a" with the operands swapped.
FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vMask = __vcmpgefp( b, a );
	return vMask;
}
|
|
|
// (a <= b && a >= -b) ? ~0 : 0, via XMVectorInBounds.
FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vMask = XMVectorInBounds( a, b );
	return vMask;
}
|
|
|
// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue |
|
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) |
|
{ |
|
return __vsel( OldValue, NewValue, ReplacementMask ); |
|
} |
|
|
|
// AKA "Broadcast", "Splat" |
|
FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a |
|
{ |
|
// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) |
|
float * pValue = &flValue; |
|
Assert( pValue ); |
|
Assert( ((unsigned int)pValue & 3) == 0); |
|
return __vspltw( __lvlx( pValue, 0 ), 0 ); |
|
} |
|
|
|
// Returns (a,a,a,a) where a is the float that pValue points to.
FORCEINLINE fltx4 ReplicateX4( const float *pValue )
{
	Assert( pValue );

	// Load the scalar into a vector, then splat word 0 across all four.
	const fltx4 vLoaded = __lvlx( pValue, 0 );
	return __vspltw( vLoaded, 0 );
}
|
|
|
/// replicate a single 32 bit integer value to all 4 components of an m128 |
|
FORCEINLINE fltx4 ReplicateIX4( int nValue ) |
|
{ |
|
// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!) |
|
int * pValue = &nValue; |
|
Assert( pValue ); |
|
Assert( ((unsigned int)pValue & 3) == 0); |
|
return __vspltw( __lvlx( pValue, 0 ), 0 ); |
|
} |
|
|
|
// Round towards positive infinity |
|
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) |
|
{ |
|
return __vrfip(a); |
|
} |
|
|
|
// Round towards nearest integer |
|
FORCEINLINE fltx4 RoundSIMD( const fltx4 &a ) |
|
{ |
|
return __vrfin(a); |
|
} |
|
|
|
// Round towards negative infinity |
|
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a ) |
|
{ |
|
return __vrfim(a); |
|
} |
|
|
|
// sqrt(a), more or less. XMVectorSqrtEst emulates this from rsqrt.
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
{
	const fltx4 vRootEstimate = XMVectorSqrtEst( a );
	return vRootEstimate;
}
|
|
|
// sqrt(a). XMVectorSqrt emulates this from rsqrt.
FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
{
	const fltx4 vRoot = XMVectorSqrt( a );
	return vRoot;
}
|
|
|
// 1/sqrt(a), more or less (__vrsqrtefp estimate).
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
{
	const fltx4 vEstimate = __vrsqrtefp( a );
	return vEstimate;
}
|
|
|
// Estimate of 1/sqrt(a) in which components equal to zero are first
// replaced by an epsilon.
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	// Mask of the zero components, used to OR Four_Epsilons into those lanes only.
	const fltx4 vIsZero = CmpEqSIMD( a, Four_Zeros );
	const fltx4 vEpsilonWhereZero = AndSIMD( Four_Epsilons, vIsZero );
	const fltx4 vSafeInput = OrSIMD( a, vEpsilonWhereZero );
	return ReciprocalSqrtEstSIMD( vSafeInput );
}
|
|
|
// 1/sqrt(a). XMVectorReciprocalSqrt uses Newton-Raphson to improve the
// hardware result.
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
{
	const fltx4 vResult = XMVectorReciprocalSqrt( a );
	return vResult;
}
|
|
|
// 1/a, more or less (__vrefp estimate).
FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
{
	const fltx4 vEstimate = __vrefp( a );
	return vEstimate;
}
|
|
|
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. |
|
/// No error checking! |
|
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a |
|
{ |
|
// This uses Newton-Raphson to improve the HW result |
|
return XMVectorReciprocal( a ); |
|
} |
|
|
|
// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?) |
|
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b |
|
{ |
|
return MulSIMD( ReciprocalSIMD( b ), a ); |
|
} |
|
|
|
/// 1/x for all 4 values. |
|
/// 1/0 will result in a big but NOT infinite result |
|
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) |
|
{ |
|
// Convert zeros to epsilons |
|
fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); |
|
fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); |
|
return ReciprocalEstSIMD( a_safe ); |
|
} |
|
|
|
// 1/a in which components equal to zero are first replaced by an epsilon.
//
// FIXME: This could be faster using a select instead of the and/or pair below
// (BUT: that doesn't preserve the sign of -0.0, whereas this version does):
//		fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
//		fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
//		return ReciprocalSIMD( a_safe );
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
	// Mask of the zero components, used to OR Four_Epsilons into those lanes only.
	const fltx4 vIsZero = CmpEqSIMD( a, Four_Zeros );
	const fltx4 vEpsilonWhereZero = AndSIMD( Four_Epsilons, vIsZero );
	const fltx4 vSafeInput = OrSIMD( a, vEpsilonWhereZero );
	return ReciprocalSIMD( vSafeInput );
}
|
|
|
// CHRISG: is it worth doing integer bitfiddling for this? |
|
// 2^x for all values (the antilog) |
|
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) |
|
{ |
|
return XMVectorExp(toPower); |
|
} |
|
|
|
// Clamps the components of a vector to a specified minimum and maximum range. |
|
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) |
|
{ |
|
return XMVectorClamp(in, min, max); |
|
} |
|
|
|
// Load a 4-vector from pSIMD via XMLoadVector4.
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	const fltx4 vLoaded = XMLoadVector4( pSIMD );
	return vLoaded;
}
|
|
|
// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec). |
|
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) |
|
{ |
|
return XMLoadVector3( pSIMD ); |
|
} |
|
|
|
// Load a vector by dereferencing pSIMD directly as a fltx4.
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	const fltx4 *pVector = reinterpret_cast< const fltx4 * >( pSIMD );
	return *pVector;
}
|
|
|
// for the transitional class -- load a 3-by VectorAligned and squash its w component |
|
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) |
|
{ |
|
fltx4 out = XMLoadVector3A(pSIMD.Base()); |
|
// squelch w |
|
return __vrlimi( out, __vzero(), 1, 0 ); |
|
} |
|
|
|
// for the transitional class -- load a 3-by VectorAligned and squash its w component |
|
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD ) |
|
{ |
|
fltx4 out = XMLoadVector3A(pSIMD); |
|
// squelch w |
|
return __vrlimi( out, __vzero(), 1, 0 ); |
|
} |
|
|
|
// Store a to pSIMD by assigning through it as a fltx4.
FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
	fltx4 *pDest = reinterpret_cast< fltx4 * >( pSIMD );
	*pDest = a;
}
|
|
|
// Store all four components of a to pSIMD via XMStoreVector4.
FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
	float *pDest = pSIMD;
	XMStoreVector4( pDest, a );
}
|
|
|
// Store the first three components of a to pSIMD via XMStoreVector3.
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	float *pDest = pSIMD;
	XMStoreVector3( pDest, a );
}
|
|
|
|
|
// strongly typed -- for typechecking as we transition to SIMD |
|
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
XMStoreVector3A(pSIMD->Base(),a); |
|
} |
|
|
|
|
|
// Fixed-point conversion and save as SIGNED INTS. |
|
// pDest->x = Int (vSrc.x) |
|
// note: some architectures have means of doing |
|
// fixed point conversion when the fix depth is |
|
// specified as an immediate.. but there is no way |
|
// to guarantee an immediate as a parameter to function |
|
// like this. |
|
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) |
|
{ |
|
fltx4 asInt = __vctsxs( vSrc, 0 ); |
|
XMStoreVector4A(pDest->Base(), asInt); |
|
} |
|
|
|
// Transpose, in place, the 4x4 matrix whose rows are x, y, z and w.
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	const XMMATRIX mTransposed = XMMatrixTranspose( _XMMATRIX( x, y, z, w ) );
	x = mTransposed.r[0];
	y = mTransposed.r[1];
	z = mTransposed.r[2];
	w = mTransposed.r[3];
}
|
|
|
// Return one in the fastest way -- faster even than loading. |
|
FORCEINLINE fltx4 LoadZeroSIMD( void ) |
|
{ |
|
return XMVectorZero(); |
|
} |
|
|
|
// Return one in the fastest way -- faster even than loading. |
|
FORCEINLINE fltx4 LoadOneSIMD( void ) |
|
{ |
|
return XMVectorSplatOne(); |
|
} |
|
|
|
// Replicate a.x into all four components (XMVectorSplatX).
FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
	const fltx4 vSplat = XMVectorSplatX( a );
	return vSplat;
}
|
|
|
// Replicate a.y into all four components (XMVectorSplatY).
FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	const fltx4 vSplat = XMVectorSplatY( a );
	return vSplat;
}
|
|
|
// Replicate a.z into all four components (XMVectorSplatZ).
FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	const fltx4 vSplat = XMVectorSplatZ( a );
	return vSplat;
}
|
|
|
// Replicate a.w into all four components (XMVectorSplatW).
FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	const fltx4 vSplat = XMVectorSplatW( a );
	return vSplat;
}
|
|
|
// Returns a with its x component taken from x (__vrlimi mask 8, no rotation).
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	return __vrlimi( a, x, 8, 0 );
}
|
|
|
// Returns a with its y component taken from y (__vrlimi mask 4, no rotation).
FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	return __vrlimi( a, y, 4, 0 );
}
|
|
|
// Returns a with its z component taken from z (__vrlimi mask 2, no rotation).
FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	return __vrlimi( a, z, 2, 0 );
}
|
|
|
// Returns a with its w component taken from w (__vrlimi mask 1, no rotation).
FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	return __vrlimi( a, w, 1, 0 );
}
|
|
|
// Returns a with component nComponent (0..3 = x..w) replaced by flValue.
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
	// __vrlimi write masks for x, y, z, w respectively.
	static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };
	const fltx4 vSplatValue = ReplicateX4( flValue );
	return __vrlimi( a, vSplatValue, s_nVrlimiMask[nComponent], 0 );
}
|
|
|
// Rotate the components of a left by one position: __vrlimi writes all four
// components (mask 8|4|2|1) from a rotated by 1.
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
	fltx4 vDest = a;
	return __vrlimi( vDest, a, 8 | 4 | 2 | 1, 1 );
}
|
|
|
// Rotate the components of a left by two positions: __vrlimi writes all four
// components (mask 8|4|2|1) from a rotated by 2.
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
	fltx4 vDest = a;
	return __vrlimi( vDest, a, 8 | 4 | 2 | 1, 2 );
}
|
|
|
|
|
|
|
// find the lowest component of a.x, a.y, a.z, |
|
// and replicate it to the whole return value. |
|
// ignores a.w. |
|
// Though this is only five instructions long, |
|
// they are all dependent, making this stall city. |
|
// Forcing this inline should hopefully help with scheduling. |
|
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a ) |
|
{ |
|
// a is [x,y,z,G] (where G is garbage) |
|
// rotate left by one |
|
fltx4 compareOne = a ; |
|
compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 ); |
|
// compareOne is [y,z,G,G] |
|
fltx4 retval = MinSIMD( a, compareOne ); |
|
// retVal is [min(x,y), min(y,z), G, G] |
|
compareOne = __vrlimi( compareOne, a, 8 , 2); |
|
// compareOne is [z, G, G, G] |
|
retval = MinSIMD( retval, compareOne ); |
|
// retVal = [ min(min(x,y),z), G, G, G ] |
|
|
|
// splat the x component out to the whole vector and return |
|
return SplatXSIMD( retval ); |
|
} |
|
|
|
// find the highest component of a.x, a.y, a.z, |
|
// and replicate it to the whole return value. |
|
// ignores a.w. |
|
// Though this is only five instructions long, |
|
// they are all dependent, making this stall city. |
|
// Forcing this inline should hopefully help with scheduling. |
|
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a ) |
|
{ |
|
// a is [x,y,z,G] (where G is garbage) |
|
// rotate left by one |
|
fltx4 compareOne = a ; |
|
compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 ); |
|
// compareOne is [y,z,G,G] |
|
fltx4 retval = MaxSIMD( a, compareOne ); |
|
// retVal is [max(x,y), max(y,z), G, G] |
|
compareOne = __vrlimi( compareOne, a, 8 , 2); |
|
// compareOne is [z, G, G, G] |
|
retval = MaxSIMD( retval, compareOne ); |
|
// retVal = [ max(max(x,y),z), G, G, G ] |
|
|
|
// splat the x component out to the whole vector and return |
|
return SplatXSIMD( retval ); |
|
} |
|
|
|
|
|
// Transform many (horizontal) points in-place by a 3x4 matrix, |
|
// here already loaded onto three fltx4 registers. |
|
// The points must be stored as 16-byte aligned. They are points |
|
// and not vectors because we assume the w-component to be 1. |
|
// To spare yourself the annoyance of loading the matrix yourself, |
|
// use one of the overloads below. |
|
void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3); |
|
|
|
// Transform many (horizontal) points in-place by a 3x4 matrix. |
|
// The points must be stored as 16-byte aligned. They are points |
|
// and not vectors because we assume the w-component to be 1. |
|
// In this function, the matrix need not be aligned. |
|
FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix) |
|
{ |
|
return TransformManyPointsBy(pVectors, numVectors, |
|
LoadUnalignedSIMD( pMatrix[0] ), LoadUnalignedSIMD( pMatrix[1] ), LoadUnalignedSIMD( pMatrix[2] ) ); |
|
} |
|
|
|
// Transform many (horizontal) points in-place by a 3x4 matrix. |
|
// The points must be stored as 16-byte aligned. They are points |
|
// and not vectors because we assume the w-component to be 1. |
|
// In this function, the matrix must itself be aligned on a 16-byte |
|
// boundary. |
|
FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix) |
|
{ |
|
return TransformManyPointsBy(pVectors, numVectors, |
|
LoadAlignedSIMD( pMatrix[0] ), LoadAlignedSIMD( pMatrix[1] ), LoadAlignedSIMD( pMatrix[2] ) ); |
|
} |
|
|
|
// ------------------------------------ |
|
// INTEGER SIMD OPERATIONS. |
|
// ------------------------------------ |
|
|
|
// Load 4 aligned words into a SIMD register |
|
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD) |
|
{ |
|
return XMLoadVector4A(pSIMD); |
|
} |
|
|
|
// Load 4 unaligned words into a SIMD register |
|
FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD) |
|
{ |
|
return XMLoadVector4( pSIMD ); |
|
} |
|
|
|
// save into four words, 16-byte aligned |
|
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) |
|
{ |
|
*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; |
|
} |
|
|
|
// Save a into the four words behind pSIMD.Base().
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	i32x4 *pDest = reinterpret_cast< i32x4 * >( pSIMD.Base() );
	*pDest = a;
}
|
|
|
// Save a into four words at pSIMD via XMStoreVector4.
FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	int32 *pDest = pSIMD;
	XMStoreVector4( pDest, a );
}
|
|
|
|
|
// Take a fltx4 containing fixed-point uints and |
|
// return them as single precision floats. No |
|
// fixed point conversion is done. |
|
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA ) |
|
{ |
|
return __vcfux( vSrcA, 0 ); |
|
} |
|
|
|
|
|
// Take a fltx4 containing fixed-point sints and |
|
// return them as single precision floats. No |
|
// fixed point conversion is done. |
|
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) |
|
{ |
|
return __vcfsx( vSrcA, 0 ); |
|
} |
|
|
|
// Take a fltx4 containing fixed-point uints and return them as single
// precision floats. Each uint will be divided by 2^immed after conversion
// (eg, this is fixed point math).
// A macro rather than a function; it behaves as if it were declared:
//   FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
//   {
//       return __vcfux( vSrcA, uImmed );
//   }
#define UnsignedFixedIntConvertToFltSIMD( vWords, uDepth ) ( __vcfux( (vWords), (uDepth) ) )
|
|
|
// Take a fltx4 containing fixed-point sints and return them as single
// precision floats. Each int will be divided by 2^immed (eg, this is fixed
// point math).
// A macro rather than a function; it behaves as if it were declared:
//   FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
//   {
//       return __vcfsx( vSrcA, uImmed );
//   }
#define SignedFixedIntConvertToFltSIMD( vWords, uDepth ) ( __vcfsx( (vWords), (uDepth) ) )
|
|
|
// Set all components of a vector to a signed immediate int number.
// A macro rather than a function; it behaves as if it were declared:
//   FORCEINLINE fltx4 IntSetImmediateSIMD( int toImmediate )
//   {
//       return __vspltisw( toImmediate );
//   }
#define IntSetImmediateSIMD( nImmediate ) ( __vspltisw( nImmediate ) )
|
|
|
/* |
|
works on fltx4's as if they are four uints. |
|
the first parameter contains the words to be shifted, |
|
the second contains the amount to shift by AS INTS |
|
|
|
for i = 0 to 3 |
|
shift = vSrcB_i*32:(i*32)+4 |
|
vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift |
|
*/ |
|
FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB) |
|
{ |
|
return __vslw(vSrcA, vSrcB); |
|
} |
|
|
|
// Read float component idx of a by viewing the register through fltx4_union.
// NOTE: if the output goes into a register, this causes a Load-Hit-Store
// stall (don't mix fpu/vpu math!)
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	return ( (const fltx4_union &)a ).m128_f32[ idx ];
}
|
|
|
// Writable reference to float component idx of a, obtained by viewing the
// register through fltx4_union.
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
	return ( (fltx4_union &)a ).m128_f32[idx];
}
|
|
|
// Convert the whole vector to unsigned ints (__vctuxs, depth 0) and return
// word idx of the converted result.
FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
	fltx4 vConverted = __vctuxs( a, 0 );
	const fltx4_union & words = (const fltx4_union &)vConverted;
	return words.m128_u32[idx];
}
|
|
|
|
|
// Read 32-bit word idx of a by viewing the register through fltx4_union.
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
	return ( (const fltx4_union &)a ).m128_u32[idx];
}
|
|
|
// Writable reference to 32-bit word idx of a, obtained by viewing the
// register through fltx4_union.
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
	return ( (fltx4_union &)a ).m128_u32[idx];
}
|
|
|
#else |
|
|
|
//--------------------------------------------------------------------- |
|
// Intel/SSE implementation |
|
//--------------------------------------------------------------------- |
|
|
|
// Store the four floats of a to pSIMD with the aligned store (_mm_store_ps).
FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
{
	float *pDest = pSIMD;
	_mm_store_ps( pDest, a );
}
|
|
|
// Store the four floats of a to pSIMD with the unaligned store (_mm_storeu_ps).
FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
{
	float *pDest = pSIMD;
	_mm_storeu_ps( pDest, a );
}
|
|
|
|
|
// Forward declarations: StoreUnaligned3SIMD uses these before they are defined.
FORCEINLINE fltx4 RotateLeft( const fltx4 & a );
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );

// Store the first three floats of a to pSIMD[0..2], one scalar store each;
// the fourth float is not written.
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	float *pX = pSIMD;
	float *pY = pSIMD + 1;
	float *pZ = pSIMD + 2;

	_mm_store_ss( pX, a );
	_mm_store_ss( pY, RotateLeft( a ) );		// brings y into the low slot
	_mm_store_ss( pZ, RotateLeft2( a ) );		// brings z into the low slot
}
|
|
|
// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD |
|
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
StoreAlignedSIMD( pSIMD->Base(),a ); |
|
} |
|
|
|
// Load four floats from pSIMD with the aligned load (_mm_load_ps).
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	const float *pFloats = reinterpret_cast< const float * >( pSIMD );
	return _mm_load_ps( pFloats );
}
|
|
|
// a & b
FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = _mm_and_ps( a, b );
	return vBits;
}
|
|
|
// ~a & b
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = _mm_andnot_ps( a, b );
	return vBits;
}
|
|
|
// a ^ b
FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = _mm_xor_ps( a, b );
	return vBits;
}
|
|
|
// a | b
FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 vBits = _mm_or_ps( a, b );
	return vBits;
}
|
|
|
// Squelch the w component of a vector to +0.0. |
|
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) |
|
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) |
|
{ |
|
return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) ); |
|
} |
|
|
|
// for the transitional class -- load a 3-by VectorAligned and squash its w component |
|
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) |
|
{ |
|
return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) ); |
|
} |
|
|
|
#ifdef __SANITIZE_ADDRESS__ |
|
// AddressSanitizer build: same unaligned four-float load as the regular
// version, but with address sanitizing disabled for this function.
static __attribute__((no_sanitize("address"))) fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	const float *pFloats = reinterpret_cast< const float * >( pSIMD );
	return _mm_loadu_ps( pFloats );
}
|
|
|
// AddressSanitizer build of the 3-vector load, with address sanitizing
// disabled for this function. Note that _mm_loadu_ps reads four floats,
// i.e. one float beyond a 3-float source.
static __attribute__((no_sanitize("address"))) fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	const float *pFloats = reinterpret_cast< const float * >( pSIMD );
	return _mm_loadu_ps( pFloats );
}
|
#else |
|
// Load four floats from pSIMD with the unaligned load (_mm_loadu_ps).
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	const float *pFloats = reinterpret_cast< const float * >( pSIMD );
	return _mm_loadu_ps( pFloats );
}
|
|
|
// Load a 3-vector from pSIMD. Note that _mm_loadu_ps reads four floats,
// i.e. one float beyond a 3-float source.
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	const float *pFloats = reinterpret_cast< const float * >( pSIMD );
	return _mm_loadu_ps( pFloats );
}
|
#endif |
|
|
|
/// replicate a single 32 bit integer value to all 4 components of an m128 |
|
FORCEINLINE fltx4 ReplicateIX4( int i ) |
|
{ |
|
fltx4 value = _mm_set_ss( * ( ( float *) &i ) );; |
|
return _mm_shuffle_ps( value, value, 0); |
|
} |
|
|
|
|
|
// Replicate flValue into all four components.
FORCEINLINE fltx4 ReplicateX4( float flValue )
{
	const __m128 vLow = _mm_set_ss( flValue );
	// Shuffle immediate 0 selects element 0 for every output slot.
	return _mm_shuffle_ps( vLow, vLow, 0 );
}
|
|
|
|
|
// Read float component idx of a.
// NOTE: if the output goes into a register, this causes a Load-Hit-Store
// stall (don't mix fpu/vpu math!)
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
#ifdef POSIX
	// No m128_f32 member here: index the register's storage as floats.
	const float *pFloats = reinterpret_cast< float const * >( &a );
	return pFloats[idx];
#else
	return a.m128_f32[ idx ];
#endif
}
|
|
|
// Writable reference to float component idx of a.
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
#ifdef POSIX
	// No m128_f32 member here: index the register's storage as floats.
	float *pFloats = reinterpret_cast< float * >( &a );
	return pFloats[idx];
#else
	return a.m128_f32[ idx ];
#endif
}
|
|
|
// Read float component idx of a and convert its value to uint32 with a
// C-style cast.
FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
	const float flComponent = SubFloat( a, idx );
	return (uint32)flComponent;
}
|
|
|
// Read 32-bit word idx of a (raw bits, no numeric conversion).
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
#ifdef POSIX
	// No m128_u32 member here: index the register's storage as uint32s.
	const uint32 *pWords = reinterpret_cast< uint32 const * >( &a );
	return pWords[idx];
#else
	return a.m128_u32[idx];
#endif
}
|
|
|
// Writable reference to 32-bit word idx of a (raw bits).
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
#ifdef POSIX
	// No m128_u32 member here: index the register's storage as uint32s.
	uint32 *pWords = reinterpret_cast< uint32 * >( &a );
	return pWords[idx];
#else
	return a.m128_u32[idx];
#endif
}
|
|
|
// Return one in the fastest way -- on the x360, faster even than loading. |
|
FORCEINLINE fltx4 LoadZeroSIMD( void ) |
|
{ |
|
return Four_Zeros; |
|
} |
|
|
|
// Return one in the fastest way -- on the x360, faster even than loading. |
|
FORCEINLINE fltx4 LoadOneSIMD( void ) |
|
{ |
|
return Four_Ones; |
|
} |
|
|
|
// Bitwise select: where ReplacementMask has bits set the result takes
// NewValue, elsewhere it keeps OldValue.
//   result = ( mask & new ) | ( ~mask & old )
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
	const fltx4 vTakenFromNew = AndSIMD( ReplacementMask, NewValue );
	const fltx4 vKeptFromOld = AndNotSIMD( ReplacementMask, OldValue );
	return OrSIMD( vTakenFromNew, vKeptFromOld );
}
|
|
|
// Remember, SSE numbers its words 3 2 1 0, so _MM_SHUFFLE takes its
// selectors in the reverse of array-index order.
// MM_SHUFFLE_REV takes them in array-index order instead: the first argument
// selects the source for element 0, the last for element 3.
#define MM_SHUFFLE_REV(e0,e1,e2,e3) _MM_SHUFFLE(e3,e2,e1,e0)
|
|
|
// Replicate element 0 (x) of a into all four components.
FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )
{
	const fltx4 vSplat = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
	return vSplat;
}
|
|
|
// Replicate element 1 (y) of a into all four components.
FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )
{
	const fltx4 vSplat = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) );
	return vSplat;
}
|
|
|
// Replicate element 2 (z) of a into all four components.
FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )
{
	const fltx4 vSplat = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) );
	return vSplat;
}
|
|
|
// Replicate element 3 (w) of a into all four components.
// (All four selectors are equal, so argument order does not matter here.)
FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
{
	const fltx4 vSplat = _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 3, 3, 3 ) );
	return vSplat;
}
|
|
|
// Returns a with the bits selected by g_SIMD_ComponentMask[0] taken from x.
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	const fltx4 vMask = LoadAlignedSIMD( g_SIMD_ComponentMask[0] );
	return MaskedAssign( vMask, x, a );
}
|
|
|
// Returns a with the bits selected by g_SIMD_ComponentMask[1] taken from y.
FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	const fltx4 vMask = LoadAlignedSIMD( g_SIMD_ComponentMask[1] );
	return MaskedAssign( vMask, y, a );
}
|
|
|
// Returns a with the bits selected by g_SIMD_ComponentMask[2] taken from z.
FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	const fltx4 vMask = LoadAlignedSIMD( g_SIMD_ComponentMask[2] );
	return MaskedAssign( vMask, z, a );
}
|
|
|
// Returns a with the bits selected by g_SIMD_ComponentMask[3] taken from w.
FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	const fltx4 vMask = LoadAlignedSIMD( g_SIMD_ComponentMask[3] );
	return MaskedAssign( vMask, w, a );
}
|
|
|
// Returns a with the bits selected by g_SIMD_ComponentMask[nComponent]
// replaced by flValue.
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
	const fltx4 vSplatValue = ReplicateX4( flValue );
	const fltx4 vMask = LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] );
	return MaskedAssign( vMask, vSplatValue, a );
}
|
|
|
// a b c d -> b c d a |
|
FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) |
|
{ |
|
return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) ); |
|
} |
|
|
|
// a b c d -> c d a b |
|
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) |
|
{ |
|
return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) ); |
|
} |
|
|
|
// a b c d -> d a b c |
|
FORCEINLINE fltx4 RotateRight( const fltx4 & a ) |
|
{ |
|
return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 0, 3, 2, 1) ); |
|
} |
|
|
|
// a b c d -> c d a b |
|
FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) |
|
{ |
|
return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 0, 3, 2 ) ); |
|
} |
|
|
|
|
|
// Per-lane sum: a + b
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Sum = _mm_add_ps( a, b );
	return fl4Sum;
}

// Per-lane difference: a - b
FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Difference = _mm_sub_ps( a, b );
	return fl4Difference;
}

// Per-lane product: a * b
FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Product = _mm_mul_ps( a, b );
	return fl4Product;
}

// Per-lane quotient: a / b
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Quotient = _mm_div_ps( a, b );
	return fl4Quotient;
}
|
|
|
// Multiply-add: a*b + c, built from a separate multiply and add.
FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 fl4Product = MulSIMD( a, b );
	return AddSIMD( fl4Product, c );
}

// Multiply-subtract: c - a*b, built from a separate multiply and subtract.
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 fl4Product = MulSIMD( a, b );
	return SubSIMD( c, fl4Product );
}
|
|
|
// Three-component dot product of a and b (lanes 0..2), replicated into all four lanes.
// The horizontal sum is done with scalar adds.
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	const fltx4 fl4Products = MulSIMD( a, b );
	return ReplicateX4( SubFloat( fl4Products, 0 ) + SubFloat( fl4Products, 1 ) + SubFloat( fl4Products, 2 ) );
}

// Four-component dot product of a and b, replicated into all four lanes.
// The horizontal sum is done with scalar adds.
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	const fltx4 fl4Products = MulSIMD( a, b );
	return ReplicateX4( SubFloat( fl4Products, 0 ) + SubFloat( fl4Products, 1 ) + SubFloat( fl4Products, 2 ) + SubFloat( fl4Products, 3 ) );
}
|
|
|
//TODO: implement as four-way Taylor series (see xbox implementation) |
|
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) |
|
{ |
|
fltx4 result; |
|
SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); |
|
SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); |
|
SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); |
|
SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); |
|
return result; |
|
} |
|
|
|
// Write sin/cos of lanes 0..2 of radians into the matching lanes of sine and cosine.
// Lane 3 of both outputs is left untouched.
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	// FIXME: Make a fast SSE version
	for ( int nLane = 0; nLane < 3; ++nLane )
	{
		SinCos( SubFloat( radians, nLane ), &SubFloat( sine, nLane ), &SubFloat( cosine, nLane ) );
	}
}

// Write sin/cos of all four lanes of radians into the matching lanes of sine and cosine.
FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	// FIXME: Make a fast SSE version
	for ( int nLane = 0; nLane < 4; ++nLane )
	{
		SinCos( SubFloat( radians, nLane ), &SubFloat( sine, nLane ), &SubFloat( cosine, nLane ) );
	}
}
|
|
|
//TODO: implement as four-way Taylor series (see xbox implementation) |
|
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) |
|
{ |
|
// FIXME: Make a fast SSE version |
|
fltx4 result; |
|
SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); |
|
SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) ); |
|
SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) ); |
|
SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); |
|
return result; |
|
} |
|
|
|
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) |
|
{ |
|
fltx4 result; |
|
SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); |
|
SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); |
|
SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); |
|
SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); |
|
return result; |
|
} |
|
|
|
// tan^1(a/b) .. ie, pass sin in as a and cos in as b |
|
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
fltx4 result; |
|
SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); |
|
SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); |
|
SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); |
|
SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); |
|
return result; |
|
} |
|
|
|
// Negate every lane, computed as 0 - a.
FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
{
	const fltx4 fl4Zero = LoadZeroSIMD();
	return SubSIMD( fl4Zero, a );
}

// Collect the high (sign) bit of each lane into the low four bits of the result.
FORCEINLINE int TestSignSIMD( const fltx4 & a )
{
	const int nSignBits = _mm_movemask_ps( a );
	return nSignBits;
}

// True when at least one lane has its sign bit set,
// i.e. (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
FORCEINLINE bool IsAnyNegative( const fltx4 & a )
{
	const int nSignBits = TestSignSIMD( a );
	return nSignBits != 0;
}
|
|
|
// Per-lane mask: (a == b) ? ~0 : 0
FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Mask = _mm_cmpeq_ps( a, b );
	return fl4Mask;
}

// Per-lane mask: (a > b) ? ~0 : 0
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Mask = _mm_cmpgt_ps( a, b );
	return fl4Mask;
}

// Per-lane mask: (a >= b) ? ~0 : 0
FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Mask = _mm_cmpge_ps( a, b );
	return fl4Mask;
}

// Per-lane mask: (a < b) ? ~0 : 0
FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Mask = _mm_cmplt_ps( a, b );
	return fl4Mask;
}

// Per-lane mask: (a <= b) ? ~0 : 0
FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Mask = _mm_cmple_ps( a, b );
	return fl4Mask;
}
|
|
|
// for branching when a.xyzw > b.xyzw |
|
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0; |
|
} |
|
|
|
// for branching when a.xyzw >= b.xyzw |
|
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) |
|
{ |
|
return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0; |
|
} |
|
|
|
// For branching if all a.xyzw == b.xyzw |
|
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) |
|
{ |
|
return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf; |
|
} |
|
|
|
// Per-lane mask: (a <= b && a >= -b) ? ~0 : 0
FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4BelowUpper = CmpLeSIMD( a, b );
	const fltx4 fl4AboveLower = CmpGeSIMD( a, NegSIMD( b ) );
	return AndSIMD( fl4BelowUpper, fl4AboveLower );
}
|
|
|
// Per-lane minimum: min(a,b)
FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Smaller = _mm_min_ps( a, b );
	return fl4Smaller;
}

// Per-lane maximum: max(a,b)
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
{
	const fltx4 fl4Larger = _mm_max_ps( a, b );
	return fl4Larger;
}
|
|
|
|
|
|
|
// SSE lacks rounding operations. |
|
// Really. |
|
// You can emulate them by setting the rounding mode for the |
|
// whole processor and then converting to int, and then back again. |
|
// But every time you set the rounding mode, you clear out the |
|
// entire pipeline. So, I can't do them per operation. You |
|
// have to do it once, before the loop that would call these. |
|
// Round towards positive infinity |
|
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) |
|
{ |
|
fltx4 retVal; |
|
SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); |
|
SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); |
|
SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); |
|
SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); |
|
return retVal; |
|
|
|
} |
|
|
|
fltx4 fabs( const fltx4 & x ); |
|
// Round towards negative infinity |
|
// This is the implementation that was here before; it assumes |
|
// you are in round-to-floor mode, which I guess is usually the |
|
// case for us vis-a-vis SSE. It's totally unnecessary on |
|
// VMX, which has a native floor op. |
|
FORCEINLINE fltx4 FloorSIMD( const fltx4 &val ) |
|
{ |
|
fltx4 fl4Abs = fabs( val ); |
|
fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s ); |
|
ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival ); |
|
return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits |
|
} |
|
|
|
|
|
|
|
inline bool IsAllZeros( const fltx4 & var ) |
|
{ |
|
return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF; |
|
} |
|
|
|
// sqrt(a), more or less. On this path it is the same instruction as SqrtSIMD.
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
{
	const fltx4 fl4Root = _mm_sqrt_ps( a );
	return fl4Root;
}

// Per-lane sqrt(a)
FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
{
	const fltx4 fl4Root = _mm_sqrt_ps( a );
	return fl4Root;
}

// 1/sqrt(a), more or less (reciprocal square root estimate instruction)
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
{
	const fltx4 fl4Estimate = _mm_rsqrt_ps( a );
	return fl4Estimate;
}
|
|
|
// 1/sqrt(a) estimate in which lanes that compare equal to zero are first
// replaced (by OR-ing in Four_Epsilons) before the estimate is taken.
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	const fltx4 fl4IsZero = CmpEqSIMD( a, Four_Zeros );
	const fltx4 fl4Patched = OrSIMD( a, AndSIMD( Four_Epsilons, fl4IsZero ) );
	return ReciprocalSqrtEstSIMD( fl4Patched );
}
|
|
|
/// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD |
|
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) |
|
{ |
|
fltx4 guess = ReciprocalSqrtEstSIMD( a ); |
|
// newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2)); |
|
guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess )))); |
|
guess = MulSIMD( Four_PointFives, guess); |
|
return guess; |
|
} |
|
|
|
// 1/a, more or less (reciprocal estimate instruction)
FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
{
	const fltx4 fl4Estimate = _mm_rcp_ps( a );
	return fl4Estimate;
}

/// 1/x for all 4 values, more or less
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
	// OR Four_Epsilons into the lanes that compare equal to zero
	const fltx4 fl4IsZero = CmpEqSIMD( a, Four_Zeros );
	const fltx4 fl4Patched = OrSIMD( a, AndSIMD( Four_Epsilons, fl4IsZero ) );
	return ReciprocalEstSIMD( fl4Patched );
}
|
|
|
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. |
|
/// No error checking! |
|
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a |
|
{ |
|
fltx4 ret = ReciprocalEstSIMD( a ); |
|
// newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2 |
|
ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) ); |
|
return ret; |
|
} |
|
|
|
/// 1/x for all 4 values. |
|
/// 1/0 will result in a big but NOT infinite result |
|
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) |
|
{ |
|
fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); |
|
fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); |
|
ret = ReciprocalSIMD( ret ); |
|
return ret; |
|
} |
|
|
|
// CHRISG: is it worth doing integer bitfiddling for this? |
|
// 2^x for all values (the antilog) |
|
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) |
|
{ |
|
fltx4 retval; |
|
SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) ); |
|
SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) ); |
|
SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) ); |
|
SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) ); |
|
|
|
return retval; |
|
} |
|
|
|
// Clamps the components of a vector to a specified minimum and maximum range. |
|
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) |
|
{ |
|
return MaxSIMD( min, MinSIMD( max, in ) ); |
|
} |
|
|
|
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w) |
|
{ |
|
_MM_TRANSPOSE4_PS( x, y, z, w ); |
|
} |
|
|
|
// Smallest of the first three lanes of a, replicated into all four lanes.
// a is [x,y,z,G] where G is garbage that never reaches lane 0 of the result.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
{
	// lane 0 of min( [x,y,z,G], [y,z,G,x] ) is min(x,y)
	const fltx4 fl4MinXY = MinSIMD( a, RotateLeft( a ) );
	// lane 0 of min( ..., [z,G,x,y] ) is min(min(x,y),z)
	const fltx4 fl4MinXYZ = MinSIMD( fl4MinXY, RotateLeft2( a ) );
	return SplatXSIMD( fl4MinXYZ );
}

// Largest of the first three lanes of a, replicated into all four lanes.
// a is [x,y,z,G] where G is garbage that never reaches lane 0 of the result.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
{
	// lane 0 of max( [x,y,z,G], [y,z,G,x] ) is max(x,y)
	const fltx4 fl4MaxXY = MaxSIMD( a, RotateLeft( a ) );
	// lane 0 of max( ..., [z,G,x,y] ) is max(max(x,y),z)
	const fltx4 fl4MaxXYZ = MaxSIMD( fl4MaxXY, RotateLeft2( a ) );
	return SplatXSIMD( fl4MaxXYZ );
}
|
|
|
// ------------------------------------ |
|
// INTEGER SIMD OPERATIONS. |
|
// ------------------------------------ |
|
|
|
|
|
#if 0 /* pc does not have these ops */
// Set every lane of the result to the same signed integer. (Disabled on PC.)
FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
{
	//CHRISG: SSE2 has this, but not SSE1. What to do?
	fltx4 retval;
	for ( int nLane = 0; nLane < 4; ++nLane )
	{
		SubInt( retval, nLane ) = to;
	}
	return retval;
}
#endif
|
|
|
// Load 4 aligned words into a SIMD register |
|
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD) |
|
{ |
|
return _mm_load_ps( reinterpret_cast<const float *>(pSIMD) ); |
|
} |
|
|
|
// Load 4 unaligned words into a SIMD register |
|
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD) |
|
{ |
|
return _mm_loadu_ps( reinterpret_cast<const float *>(pSIMD) ); |
|
} |
|
|
|
// save into four words, 16-byte aligned |
|
FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
_mm_store_ps( reinterpret_cast<float *>(pSIMD), a ); |
|
} |
|
|
|
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) |
|
{ |
|
_mm_store_ps( reinterpret_cast<float *>(pSIMD.Base()), a ); |
|
} |
|
|
|
FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
_mm_storeu_ps( reinterpret_cast<float *>(pSIMD), a ); |
|
} |
|
|
|
|
|
// CHRISG: the conversion functions all seem to operate on m64's only... |
|
// how do we make them work here? |
|
|
|
// Take a fltx4 containing fixed-point uints and |
|
// return them as single precision floats. No |
|
// fixed point conversion is done. |
|
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) |
|
{ |
|
fltx4 retval; |
|
SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); |
|
SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); |
|
SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); |
|
SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); |
|
return retval; |
|
} |
|
|
|
|
|
// Take a fltx4 containing fixed-point sints and |
|
// return them as single precision floats. No |
|
// fixed point conversion is done. |
|
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) |
|
{ |
|
fltx4 retval; |
|
SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[0])); |
|
SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[1])); |
|
SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[2])); |
|
SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[3])); |
|
return retval; |
|
} |
|
|
|
/* |
|
works on fltx4's as if they are four uints. |
|
the first parameter contains the words to be shifted, |
|
the second contains the amount to shift by AS INTS |
|
|
|
for i = 0 to 3 |
|
shift = vSrcB_i*32:(i*32)+4 |
|
vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift |
|
*/ |
|
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB) |
|
{ |
|
i32x4 retval; |
|
SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0); |
|
SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1); |
|
SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2); |
|
SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3); |
|
|
|
|
|
return retval; |
|
} |
|
|
|
|
|
// Fixed-point conversion and save as SIGNED INTS. |
|
// pDest->x = Int (vSrc.x) |
|
// note: some architectures have means of doing |
|
// fixed point conversion when the fix depth is |
|
// specified as an immediate.. but there is no way |
|
// to guarantee an immediate as a parameter to function |
|
// like this. |
|
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) |
|
{ |
|
#if defined( COMPILER_MSVC64 ) |
|
|
|
(*pDest)[0] = SubFloat( vSrc, 0 ); |
|
(*pDest)[1] = SubFloat( vSrc, 1 ); |
|
(*pDest)[2] = SubFloat( vSrc, 2 ); |
|
(*pDest)[3] = SubFloat( vSrc, 3 ); |
|
|
|
#else |
|
__m64 bottom = _mm_cvttps_pi32( vSrc ); |
|
__m64 top = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) ); |
|
|
|
*reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom; |
|
*reinterpret_cast<__m64 *>(&(*pDest)[2]) = top; |
|
|
|
_mm_empty(); |
|
#endif |
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are |
|
/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. |
|
class ALIGN16 FourVectors |
|
{ |
|
public: |
|
fltx4 x, y, z; |
|
|
|
FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value |
|
{ |
|
x=ReplicateX4(v.x); |
|
y=ReplicateX4(v.y); |
|
z=ReplicateX4(v.z); |
|
} |
|
|
|
FORCEINLINE fltx4 const & operator[](int idx) const |
|
{ |
|
return *((&x)+idx); |
|
} |
|
|
|
FORCEINLINE fltx4 & operator[](int idx) |
|
{ |
|
return *((&x)+idx); |
|
} |
|
|
|
FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors |
|
{ |
|
x=AddSIMD(x,b.x); |
|
y=AddSIMD(y,b.y); |
|
z=AddSIMD(z,b.z); |
|
} |
|
|
|
FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4 |
|
{ |
|
x=SubSIMD(x,b.x); |
|
y=SubSIMD(y,b.y); |
|
z=SubSIMD(z,b.z); |
|
} |
|
|
|
FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale |
|
{ |
|
x=MulSIMD(x,b.x); |
|
y=MulSIMD(y,b.y); |
|
z=MulSIMD(z,b.z); |
|
} |
|
|
|
FORCEINLINE void operator*=(const fltx4 & scale) //< scale |
|
{ |
|
x=MulSIMD(x,scale); |
|
y=MulSIMD(y,scale); |
|
z=MulSIMD(z,scale); |
|
} |
|
|
|
FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors |
|
{ |
|
fltx4 scalepacked = ReplicateX4(scale); |
|
*this *= scalepacked; |
|
} |
|
|
|
FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products |
|
{ |
|
fltx4 dot=MulSIMD(x,b.x); |
|
dot=MaddSIMD(y,b.y,dot); |
|
dot=MaddSIMD(z,b.z,dot); |
|
return dot; |
|
} |
|
|
|
FORCEINLINE fltx4 operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector |
|
{ |
|
fltx4 dot=MulSIMD(x,ReplicateX4(b.x)); |
|
dot=MaddSIMD(y,ReplicateX4(b.y), dot); |
|
dot=MaddSIMD(z,ReplicateX4(b.z), dot); |
|
return dot; |
|
} |
|
|
|
FORCEINLINE void VProduct(FourVectors const &b) //< component by component mul |
|
{ |
|
x=MulSIMD(x,b.x); |
|
y=MulSIMD(y,b.y); |
|
z=MulSIMD(z,b.z); |
|
} |
|
FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z) |
|
{ |
|
x=ReciprocalSIMD(x); |
|
y=ReciprocalSIMD(y); |
|
z=ReciprocalSIMD(z); |
|
} |
|
|
|
FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23 |
|
{ |
|
x=ReciprocalSaturateSIMD(x); |
|
y=ReciprocalSaturateSIMD(y); |
|
z=ReciprocalSaturateSIMD(z); |
|
} |
|
|
|
// Assume the given matrix is a rotation, and rotate these vectors by it. |
|
// If you have a long list of FourVectors structures that you all want |
|
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. |
|
inline void RotateBy(const matrix3x4_t& matrix); |
|
|
|
/// You can use this to rotate a long array of FourVectors all by the same |
|
/// matrix. The first parameter is the head of the array. The second is the |
|
/// number of vectors to rotate. The third is the matrix. |
|
static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); |
|
|
|
/// Assume the vectors are points, and transform them in place by the matrix. |
|
inline void TransformBy(const matrix3x4_t& matrix); |
|
|
|
/// You can use this to Transform a long array of FourVectors all by the same |
|
/// matrix. The first parameter is the head of the array. The second is the |
|
/// number of vectors to rotate. The third is the matrix. The fourth is the |
|
/// output buffer, which must not overlap the pVectors buffer. This is not |
|
/// an in-place transformation. |
|
static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ); |
|
|
|
/// You can use this to Transform a long array of FourVectors all by the same |
|
/// matrix. The first parameter is the head of the array. The second is the |
|
/// number of vectors to rotate. The third is the matrix. The fourth is the |
|
/// output buffer, which must not overlap the pVectors buffer. |
|
/// This is an in-place transformation. |
|
static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); |
|
|
|
// X(),Y(),Z() - get at the desired component of the i'th (0..3) vector. |
|
FORCEINLINE const float & X(int idx) const |
|
{ |
|
// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) |
|
return SubFloat( (fltx4 &)x, idx ); |
|
} |
|
|
|
FORCEINLINE const float & Y(int idx) const |
|
{ |
|
return SubFloat( (fltx4 &)y, idx ); |
|
} |
|
|
|
FORCEINLINE const float & Z(int idx) const |
|
{ |
|
return SubFloat( (fltx4 &)z, idx ); |
|
} |
|
|
|
FORCEINLINE float & X(int idx) |
|
{ |
|
return SubFloat( x, idx ); |
|
} |
|
|
|
FORCEINLINE float & Y(int idx) |
|
{ |
|
return SubFloat( y, idx ); |
|
} |
|
|
|
FORCEINLINE float & Z(int idx) |
|
{ |
|
return SubFloat( z, idx ); |
|
} |
|
|
|
FORCEINLINE Vector Vec(int idx) const //< unpack one of the vectors |
|
{ |
|
return Vector( X(idx), Y(idx), Z(idx) ); |
|
} |
|
|
|
FourVectors(void) = default; |
|
|
|
FourVectors( FourVectors const &src ) |
|
{ |
|
x=src.x; |
|
y=src.y; |
|
z=src.z; |
|
} |
|
|
|
FourVectors( fltx4 x, fltx4 y, fltx4 z ) |
|
{ |
|
this->x=x; |
|
this->y=y; |
|
this->z=z; |
|
} |
|
|
|
FORCEINLINE void operator=( FourVectors const &src ) |
|
{ |
|
x=src.x; |
|
y=src.y; |
|
z=src.z; |
|
} |
|
|
|
/// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op |
|
FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d) |
|
{ |
|
// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 |
|
// use an unfolded implementation here |
|
#if _X360 |
|
fltx4 tx = LoadUnalignedSIMD( &a.x ); |
|
fltx4 ty = LoadUnalignedSIMD( &b.x ); |
|
fltx4 tz = LoadUnalignedSIMD( &c.x ); |
|
fltx4 tw = LoadUnalignedSIMD( &d.x ); |
|
fltx4 r0 = __vmrghw(tx, tz); |
|
fltx4 r1 = __vmrghw(ty, tw); |
|
fltx4 r2 = __vmrglw(tx, tz); |
|
fltx4 r3 = __vmrglw(ty, tw); |
|
|
|
x = __vmrghw(r0, r1); |
|
y = __vmrglw(r0, r1); |
|
z = __vmrghw(r2, r3); |
|
#else |
|
x = LoadUnalignedSIMD( &( a.x )); |
|
y = LoadUnalignedSIMD( &( b.x )); |
|
z = LoadUnalignedSIMD( &( c.x )); |
|
fltx4 w = LoadUnalignedSIMD( &( d.x )); |
|
// now, matrix is: |
|
// x y z ? |
|
// x y z ? |
|
// x y z ? |
|
// x y z ? |
|
TransposeSIMD(x, y, z, w); |
|
#endif |
|
} |
|
|
|
/// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op. |
|
/// all 4 vectors must be 128 bit boundary |
|
FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d) |
|
{ |
|
#if _X360 |
|
fltx4 tx = LoadAlignedSIMD(a); |
|
fltx4 ty = LoadAlignedSIMD(b); |
|
fltx4 tz = LoadAlignedSIMD(c); |
|
fltx4 tw = LoadAlignedSIMD(d); |
|
fltx4 r0 = __vmrghw(tx, tz); |
|
fltx4 r1 = __vmrghw(ty, tw); |
|
fltx4 r2 = __vmrglw(tx, tz); |
|
fltx4 r3 = __vmrglw(ty, tw); |
|
|
|
x = __vmrghw(r0, r1); |
|
y = __vmrglw(r0, r1); |
|
z = __vmrghw(r2, r3); |
|
#else |
|
x = LoadAlignedSIMD( a ); |
|
y = LoadAlignedSIMD( b ); |
|
z = LoadAlignedSIMD( c ); |
|
fltx4 w = LoadAlignedSIMD( d ); |
|
// now, matrix is: |
|
// x y z ? |
|
// x y z ? |
|
// x y z ? |
|
// x y z ? |
|
TransposeSIMD( x, y, z, w ); |
|
#endif |
|
} |
|
|
|
FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d) |
|
{ |
|
LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x ); |
|
} |
|
|
|
/// return the squared length of all 4 vectors |
|
FORCEINLINE fltx4 length2(void) const |
|
{ |
|
return (*this)*(*this); |
|
} |
|
|
|
/// return the approximate length of all 4 vectors. uses the sqrt approximation instruction |
|
FORCEINLINE fltx4 length(void) const |
|
{ |
|
return SqrtEstSIMD(length2()); |
|
} |
|
|
|
/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction) |
|
FORCEINLINE void VectorNormalizeFast(void) |
|
{ |
|
fltx4 mag_sq=(*this)*(*this); // length^2 |
|
(*this) *= ReciprocalSqrtEstSIMD(mag_sq); // *(1.0/sqrt(length^2)) |
|
} |
|
|
|
/// normalize all 4 vectors in place. |
|
FORCEINLINE void VectorNormalize(void) |
|
{ |
|
fltx4 mag_sq=(*this)*(*this); // length^2 |
|
(*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2)) |
|
} |
|
|
|
/// construct a FourVectors from 4 separate Vectors |
|
FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d) |
|
{ |
|
LoadAndSwizzle(a,b,c,d); |
|
} |
|
|
|
/// construct a FourVectors from 4 separate Vectors |
|
FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d) |
|
{ |
|
LoadAndSwizzleAligned(a,b,c,d); |
|
} |
|
|
|
FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt ) |
|
{ |
|
fltx4 fl4dX = SubSIMD( pnt.x, x ); |
|
fltx4 fl4dY = SubSIMD( pnt.y, y ); |
|
fltx4 fl4dZ = SubSIMD( pnt.z, z ); |
|
return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) ); |
|
|
|
} |
|
|
|
FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const |
|
{ |
|
FourVectors lineDelta = p1; |
|
lineDelta -= p0; |
|
fltx4 OOlineDirDotlineDir = ReciprocalSIMD( p1 * p1 ); |
|
FourVectors v4OurPnt = *this; |
|
v4OurPnt -= p0; |
|
return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta ); |
|
} |
|
|
|
FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const |
|
{ |
|
FourVectors lineDelta = p1; |
|
FourVectors v4OurPnt = *this; |
|
v4OurPnt -= p0; |
|
lineDelta -= p0; |
|
|
|
fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta ); |
|
|
|
fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta ); |
|
|
|
fl4T = MinSIMD( fl4T, Four_Ones ); |
|
fl4T = MaxSIMD( fl4T, Four_Zeros ); |
|
lineDelta *= fl4T; |
|
return v4OurPnt.DistToSqr( lineDelta ); |
|
} |
|
|
|
}; |
|
|
|
/// form 4 cross products |
|
inline FourVectors operator ^(const FourVectors &a, const FourVectors &b) |
|
{ |
|
FourVectors ret; |
|
ret.x=SubSIMD(MulSIMD(a.y,b.z),MulSIMD(a.z,b.y)); |
|
ret.y=SubSIMD(MulSIMD(a.z,b.x),MulSIMD(a.x,b.z)); |
|
ret.z=SubSIMD(MulSIMD(a.x,b.y),MulSIMD(a.y,b.x)); |
|
return ret; |
|
} |
|
|
|
/// component-by-componentwise MAX operator |
|
inline FourVectors maximum(const FourVectors &a, const FourVectors &b) |
|
{ |
|
FourVectors ret; |
|
ret.x=MaxSIMD(a.x,b.x); |
|
ret.y=MaxSIMD(a.y,b.y); |
|
ret.z=MaxSIMD(a.z,b.z); |
|
return ret; |
|
} |
|
|
|
/// component-by-componentwise MIN operator |
|
inline FourVectors minimum(const FourVectors &a, const FourVectors &b) |
|
{ |
|
FourVectors ret; |
|
ret.x=MinSIMD(a.x,b.x); |
|
ret.y=MinSIMD(a.y,b.y); |
|
ret.z=MinSIMD(a.z,b.z); |
|
return ret; |
|
} |
|
|
|
/// calculate reflection vector. incident and normal dir assumed normalized |
|
FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal ) |
|
{ |
|
FourVectors ret = incident; |
|
fltx4 iDotNx2 = incident * normal; |
|
iDotNx2 = AddSIMD( iDotNx2, iDotNx2 ); |
|
FourVectors nPart = normal; |
|
nPart *= iDotNx2; |
|
ret -= nPart; // i-2(n*i)n |
|
return ret; |
|
} |
|
|
|
/// calculate slide vector. removes all components of a vector which are perpendicular to a normal vector. |
|
FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal ) |
|
{ |
|
FourVectors ret = incident; |
|
fltx4 iDotN = incident * normal; |
|
FourVectors nPart = normal; |
|
nPart *= iDotN; |
|
ret -= nPart; // i-(n*i)n |
|
return ret; |
|
} |
|
|
|
|
|
// Assume the given matrix is a rotation, and rotate these vectors by it. |
|
// If you have a long list of FourVectors structures that you all want |
|
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. |
|
void FourVectors::RotateBy(const matrix3x4_t& matrix) |
|
{ |
|
// Splat out each of the entries in the matrix to a fltx4. Do this |
|
// in the order that we will need them, to hide latency. I'm |
|
// avoiding making an array of them, so that they'll remain in |
|
// registers. |
|
fltx4 matSplat00, matSplat01, matSplat02, |
|
matSplat10, matSplat11, matSplat12, |
|
matSplat20, matSplat21, matSplat22; |
|
|
|
{ |
|
// Load the matrix into local vectors. Sadly, matrix3x4_ts are |
|
// often unaligned. The w components will be the tranpose row of |
|
// the matrix, but we don't really care about that. |
|
fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] ); |
|
fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] ); |
|
fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] ); |
|
|
|
matSplat00 = SplatXSIMD( matCol0 ); |
|
matSplat01 = SplatYSIMD( matCol0 ); |
|
matSplat02 = SplatZSIMD( matCol0 ); |
|
|
|
matSplat10 = SplatXSIMD( matCol1 ); |
|
matSplat11 = SplatYSIMD( matCol1 ); |
|
matSplat12 = SplatZSIMD( matCol1 ); |
|
|
|
matSplat20 = SplatXSIMD( matCol2 ); |
|
matSplat21 = SplatYSIMD( matCol2 ); |
|
matSplat22 = SplatZSIMD( matCol2 ); |
|
} |
|
|
|
// Trust in the compiler to schedule these operations correctly: |
|
fltx4 outX, outY, outZ; |
|
outX = AddSIMD( AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ), MulSIMD( z, matSplat02 ) ); |
|
outY = AddSIMD( AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ), MulSIMD( z, matSplat12 ) ); |
|
outZ = AddSIMD( AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ), MulSIMD( z, matSplat22 ) ); |
|
|
|
x = outX; |
|
y = outY; |
|
z = outZ; |
|
} |
|
|
|
// Assume the given matrix is a rotation, and rotate these vectors by it. |
|
// If you have a long list of FourVectors structures that you all want |
|
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. |
|
void FourVectors::TransformBy(const matrix3x4_t& matrix) |
|
{ |
|
// Splat out each of the entries in the matrix to a fltx4. Do this |
|
// in the order that we will need them, to hide latency. I'm |
|
// avoiding making an array of them, so that they'll remain in |
|
// registers. |
|
fltx4 matSplat00, matSplat01, matSplat02, |
|
matSplat10, matSplat11, matSplat12, |
|
matSplat20, matSplat21, matSplat22; |
|
|
|
{ |
|
// Load the matrix into local vectors. Sadly, matrix3x4_ts are |
|
// often unaligned. The w components will be the tranpose row of |
|
// the matrix, but we don't really care about that. |
|
fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] ); |
|
fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] ); |
|
fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] ); |
|
|
|
matSplat00 = SplatXSIMD( matCol0 ); |
|
matSplat01 = SplatYSIMD( matCol0 ); |
|
matSplat02 = SplatZSIMD( matCol0 ); |
|
|
|
matSplat10 = SplatXSIMD( matCol1 ); |
|
matSplat11 = SplatYSIMD( matCol1 ); |
|
matSplat12 = SplatZSIMD( matCol1 ); |
|
|
|
matSplat20 = SplatXSIMD( matCol2 ); |
|
matSplat21 = SplatYSIMD( matCol2 ); |
|
matSplat22 = SplatZSIMD( matCol2 ); |
|
} |
|
|
|
// Trust in the compiler to schedule these operations correctly: |
|
fltx4 outX, outY, outZ; |
|
|
|
outX = MaddSIMD( z, matSplat02, AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ) ); |
|
outY = MaddSIMD( z, matSplat12, AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ) ); |
|
outZ = MaddSIMD( z, matSplat22, AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ) ); |
|
|
|
x = AddSIMD( outX, ReplicateX4( matrix[0][3] )); |
|
y = AddSIMD( outY, ReplicateX4( matrix[1][3] )); |
|
z = AddSIMD( outZ, ReplicateX4( matrix[2][3] )); |
|
} |
|
|
|
|
|
|
|
/// quick, low quality perlin-style noise() function suitable for real time use. |
|
/// return value is -1..1. Only reliable around +/- 1 million or so. |
|
fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z ); |
|
fltx4 NoiseSIMD( FourVectors const &v ); |
|
|
|
// vector valued noise direction |
|
FourVectors DNoiseSIMD( FourVectors const &v ); |
|
|
|
// vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html |
|
FourVectors CurlNoiseSIMD( FourVectors const &v ); |
|
|
|
|
|
/// calculate the absolute value of a packed single |
|
inline fltx4 fabs( const fltx4 & x ) |
|
{ |
|
return AndSIMD( x, LoadAlignedSIMD( g_SIMD_clear_signmask ) ); |
|
} |
|
|
|
/// negate all four components of a SIMD packed single |
|
inline fltx4 fnegate( const fltx4 & x ) |
|
{ |
|
return XorSIMD( x, LoadAlignedSIMD( g_SIMD_signmask ) ); |
|
} |
|
|
|
|
|
/// Raises x to a fixed-point power. PowSIMD() below passes the exponent
/// pre-multiplied by 4, i.e. with two fractional bits.
fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 &x, int exponent );

/// PowSIMD - raise a SIMD register to a power. Analogous to the C pow()
/// function, with some restrictions:
///  - Fractional exponents are handled with only 2 bits of precision, so only
///    fractions of 0, .25, .5 and .75 are distinguished. PowSIMD(x,.30) gives
///    the same result as PowSIMD(x,.25).
///  - Negative and fractional powers go through the SIMD reciprocal and square
///    root approximation instructions, so they are not especially accurate.
///  - No numeric exceptions are raised, because the routine uses SIMD.
/// The routine is O(log2(exponent)).
inline fltx4 PowSIMD( const fltx4 & x, float exponent )
{
	// Convert to quarter units; the cast truncates toward zero.
	const int nQuarterUnits = (int)( 4.0 * exponent );
	return Pow_FixedPoint_Exponent_SIMD( x, nQuarterUnits );
}
|
|
|
|
|
|
|
// random number generation - generate 4 random numbers quickly. |
|
|
|
void SeedRandSIMD(uint32 seed); // seed the random # generator |
|
fltx4 RandSIMD( int nContext = 0 ); // return 4 numbers in the 0..1 range |
|
|
|
// for multithreaded, you need to use these and use the argument form of RandSIMD: |
|
int GetSIMDRandContext( void ); |
|
void ReleaseSIMDRandContext( int nContext ); |
|
|
|
/// Returns four random numbers in the -1..1 range, obtained by rescaling the
/// 0..1 output of RandSIMD() as 2*r - 1.
FORCEINLINE fltx4 RandSignedSIMD( void )
{
	const fltx4 fl4Doubled = MulSIMD( Four_Twos, RandSIMD() );
	return SubSIMD( fl4Doubled, Four_Ones );
}
|
|
|
|
|
// SIMD versions of mathlib simplespline functions |
|
// hermite basis function for smooth interpolation |
|
// Similar to Gain() above, but very cheap to call |
|
// value should be between 0 & 1 inclusive |
|
inline fltx4 SimpleSpline( const fltx4 & value ) |
|
{ |
|
// Arranged to avoid a data dependency between these two MULs: |
|
fltx4 valueDoubled = MulSIMD( value, Four_Twos ); |
|
fltx4 valueSquared = MulSIMD( value, value ); |
|
|
|
// Nice little ease-in, ease-out spline-like curve |
|
return SubSIMD( |
|
MulSIMD( Four_Threes, valueSquared ), |
|
MulSIMD( valueDoubled, valueSquared ) ); |
|
} |
|
|
|
// remaps a value in [startInterval, startInterval+rangeInterval] from linear to |
|
// spline using SimpleSpline |
|
inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val, |
|
const fltx4 & A, const fltx4 & BMinusA, |
|
const fltx4 & OneOverBMinusA, const fltx4 & C, |
|
const fltx4 & DMinusC ) |
|
{ |
|
// if ( A == B ) |
|
// return val >= B ? D : C; |
|
fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA ); |
|
return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) ); |
|
} |
|
|
|
/// Same remap as SimpleSplineRemapValWithDeltas, except that the normalised
/// position is clamped to 0..1 before easing, so the result stays between C
/// and C + DMinusC.
/// BMinusA is accepted but not read by this implementation.
/// NOTE: the degenerate A == B case (which would return val >= B ? D : C) is
/// not special-cased here.
inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val,
													const fltx4 & A, const fltx4 & BMinusA,
													const fltx4 & OneOverBMinusA, const fltx4 & C,
													const fltx4 & DMinusC )
{
	// Normalised position of val inside the source interval.
	const fltx4 fl4Offset = SubSIMD( val, A );
	const fltx4 fl4T = MulSIMD( fl4Offset, OneOverBMinusA );

	// Clamp to [0,1]: lower bound first, then upper bound.
	const fltx4 fl4AtLeastZero = MaxSIMD( Four_Zeros, fl4T );
	const fltx4 fl4Clamped = MinSIMD( Four_Ones, fl4AtLeastZero );

	const fltx4 fl4Eased = SimpleSpline( fl4Clamped );
	return AddSIMD( C, MulSIMD( DMinusC, fl4Eased ) );
}
|
|
|
/// Fractional part of each lane, carrying the sign of the input
/// (computed as |val| - floor(|val|), then re-signed).
FORCEINLINE fltx4 FracSIMD( const fltx4 &val )
{
	const fltx4 fl4Magnitude = fabs( val );

	// Adding and then subtracting 2^23 leaves a whole number near the magnitude.
	fltx4 fl4Whole = AddSIMD( fl4Magnitude, Four_2ToThe23s );
	fl4Whole = SubSIMD( fl4Whole, Four_2ToThe23s );

	// Where that landed above the magnitude, step back down by one so that
	// fl4Whole never exceeds the magnitude.
	const fltx4 fl4WentUp = CmpGtSIMD( fl4Whole, fl4Magnitude );
	fl4Whole = MaskedAssign( fl4WentUp, SubSIMD( fl4Whole, Four_Ones ), fl4Whole );

	// val ^ |val| is what fabs() removed; XOR it back in to restore the sign bits.
	const fltx4 fl4SignBits = XorSIMD( val, fl4Magnitude );
	const fltx4 fl4Fraction = SubSIMD( fl4Magnitude, fl4Whole );
	return XorSIMD( fl4Fraction, fl4SignBits );
}
|
|
|
/// Remainder of each lane modulo 2, carrying the sign of the input: the
/// magnitude of the result is |val| minus an even whole number no greater
/// than |val|.
FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );

	// Add 2^23, AND the bit pattern with g_SIMD_lsbmask, then remove the bias
	// again. The mask is passed straight to LoadAlignedSIMD, exactly as
	// Mod2SIMDPositiveInput does; no const-stripping cast is needed.
	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s );

	// Where the result overshot the magnitude, step back down by two.
	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival );

	// val ^ |val| is what fabs() removed; XOR it back in to restore the sign bits.
	return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) );
}
|
|
|
/// Variant of Mod2SIMD that skips the fabs() and sign-restore steps; as the
/// name says, it is meant for non-negative input. Returns val minus an even
/// whole number no greater than val.
FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val )
{
	// Add 2^23, AND the bit pattern with g_SIMD_lsbmask, then remove the bias.
	const fltx4 fl4Biased = AddSIMD( val, Four_2ToThe23s );
	const fltx4 fl4Masked = AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), fl4Biased );
	fltx4 fl4Even = SubSIMD( fl4Masked, Four_2ToThe23s );

	// Where the result overshot the input, step back down by two.
	const fltx4 fl4Overshot = CmpGtSIMD( fl4Even, val );
	fl4Even = MaskedAssign( fl4Overshot, SubSIMD( fl4Even, Four_Twos ), fl4Even );

	return SubSIMD( val, fl4Even );
}
|
|
|
|
|
// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi. |
|
// no range reduction is done - for values outside of 0..1 you won't like the results |
|
FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val ) |
|
{ |
|
// really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between. |
|
// sufficient for simple oscillation. |
|
return MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) ); |
|
} |
|
|
|
/// Better sine approximation over 0..1 (same input convention as
/// _SinEst01SIMD; no range reduction is performed).
FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val )
{
	// The parabola x*(4 - x*4) always over-estimates and the squared parabola
	// always under-estimates, so blend between the two:
	//   goodsin = badsin + .225*( badsin^2 - badsin )
	const fltx4 fl4Parabola = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
	const fltx4 fl4ParabolaSq = MulSIMD( fl4Parabola, fl4Parabola );
	const fltx4 fl4Correction = MulSIMD( Four_Point225s, SubSIMD( fl4ParabolaSq, fl4Parabola ) );
	return AddSIMD( fl4Correction, fl4Parabola );
}
|
|
|
// full range useable implementations |
|
FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val ) |
|
{ |
|
fltx4 fl4Abs = fabs( val ); |
|
fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs ); |
|
fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones ); |
|
fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) ); |
|
fltx4 fl4Sin = _SinEst01SIMD( fl4val ); |
|
fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) ); |
|
return fl4Sin; |
|
|
|
} |
|
|
|
/// Blended sine approximation (see _Sin01SIMD) with range reduction, so any
/// input may be passed. One unit of input corresponds to half a period.
FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val )
{
	// Reduce |val| modulo 2.
	const fltx4 fl4Magnitude = fabs( val );
	const fltx4 fl4Mod2 = Mod2SIMDPositiveInput( fl4Magnitude );

	// Lanes that landed at or above 1 are in the second half of the period:
	// shift them down by one and remember to flip their sign.
	const fltx4 fl4SecondHalf = CmpGeSIMD( fl4Mod2, Four_Ones );
	const fltx4 fl4Phase = SubSIMD( fl4Mod2, AndSIMD( Four_Ones, fl4SecondHalf ) );

	const fltx4 fl4Unsigned = _Sin01SIMD( fl4Phase );

	// The result's sign bit is flipped when exactly one of "input sign bit set"
	// and "second half" holds.
	const fltx4 fl4SignFlip = AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4SecondHalf ) );
	return XorSIMD( fl4Unsigned, fl4SignFlip );
}
|
|
|
// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1) |
|
|
|
FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter ) |
|
{ |
|
// convert perlin-style-bias parameter to the value right for the approximation |
|
return SubSIMD( ReciprocalSIMD( bias_parameter ), Four_Twos ); |
|
} |
|
|
|
/// Bias function evaluated as val / ( precalc_param*(1 - val) + 1 ), where
/// precalc_param is the value returned by PreCalcBiasParameter.
FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param )
{
	//!!speed!! use reciprocal est?
	//!!speed!! could save one op by precalcing _2_ values
	const fltx4 fl4OneMinusVal = SubSIMD( Four_Ones, val );
	const fltx4 fl4Denominator = AddSIMD( MulSIMD( precalc_param, fl4OneMinusVal ), Four_Ones );
	return DivSIMD( val, fl4Denominator );
}
|
|
|
//----------------------------------------------------------------------------- |
|
// Box/plane test |
|
// NOTE: The w component of emins + emaxs must be 1 for this to work |
|
//----------------------------------------------------------------------------- |
|
FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f ) |
|
{ |
|
fltx4 corners[2]; |
|
fltx4 normal = LoadUnalignedSIMD( p->normal.Base() ); |
|
fltx4 dist = ReplicateX4( -p->dist ); |
|
normal = SetWSIMD( normal, dist ); |
|
fltx4 t4 = ReplicateX4( tolerance ); |
|
fltx4 negt4 = ReplicateX4( -tolerance ); |
|
fltx4 cmp = CmpGeSIMD( normal, Four_Zeros ); |
|
corners[0] = MaskedAssign( cmp, emaxs, emins ); |
|
corners[1] = MaskedAssign( cmp, emins, emaxs ); |
|
fltx4 dot1 = Dot4SIMD( normal, corners[0] ); |
|
fltx4 dot2 = Dot4SIMD( normal, corners[1] ); |
|
cmp = CmpGeSIMD( dot1, t4 ); |
|
fltx4 cmp2 = CmpGtSIMD( negt4, dot2 ); |
|
fltx4 result = MaskedAssign( cmp, Four_Ones, Four_Zeros ); |
|
fltx4 result2 = MaskedAssign( cmp2, Four_Twos, Four_Zeros ); |
|
result = AddSIMD( result, result2 ); |
|
intx4 sides; |
|
ConvertStoreAsIntsSIMD( &sides, result ); |
|
return sides[0]; |
|
} |
|
|
|
#endif // _ssemath_h
|
|
|