You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
367 lines
11 KiB
367 lines
11 KiB
//========= Copyright Valve Corporation, All rights reserved. ============// |
|
// |
|
// Purpose: - defines SIMD "structure of arrays" classes and functions. |
|
// |
|
//===========================================================================// |
|
#ifndef SSEQUATMATH_H |
|
#define SSEQUATMATH_H |
|
|
|
#ifdef _WIN32 |
|
#pragma once |
|
#endif |
|
|
|
|
|
#include "mathlib/ssemath.h" |
|
|
|
// Use this #define to allow SSE versions of Quaternion math |
|
// to exist on PC. |
|
// On PC, certain horizontal vector operations are not supported. |
|
// This causes the SSE implementation of quaternion math to mix the |
|
// vector and scalar floating point units, which is extremely |
|
// performance negative if you don't compile to native SSE2 (which |
|
// we don't as of Sept 1, 2007). So, it's best not to allow these |
|
// functions to exist at all. It's not good enough to simply replace |
|
// the contents of the functions with scalar math, because each call |
|
// to LoadAligned and StoreAligned will result in an unnecssary copy |
|
// of the quaternion, and several moves to and from the XMM registers. |
|
// |
|
// Basically, the problem you run into is that for efficient SIMD code, |
|
// you need to load the quaternions and vectors into SIMD registers and |
|
// keep them there as long as possible while doing only SIMD math, |
|
// whereas for efficient scalar code, each time you copy onto or ever |
|
// use a fltx4, it hoses your pipeline. So the difference has to be |
|
// in the management of temporary variables in the calling function, |
|
// not inside the math functions. |
|
// |
|
// If you compile assuming the presence of SSE2, the MSVC will abandon |
|
// the traditional x87 FPU operations altogether and make everything use |
|
// the SSE2 registers, which lessens this problem a little. |
|
|
|
// permitted only on 360, as we've done careful tuning on its Altivec math: |
|
#ifdef _X360 |
|
#define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! |
|
#endif |
|
|
|
|
|
|
|
//--------------------------------------------------------------------- |
|
// Load/store quaternions |
|
//--------------------------------------------------------------------- |
|
#ifndef _X360 |
|
#if ALLOW_SIMD_QUATERNION_MATH |
|
// Using STDC or SSE |
|
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) |
|
{ |
|
fltx4 retval = LoadAlignedSIMD( pSIMD.Base() ); |
|
return retval; |
|
} |
|
|
|
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) |
|
{ |
|
fltx4 retval = LoadAlignedSIMD( pSIMD ); |
|
return retval; |
|
} |
|
|
|
FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
StoreAlignedSIMD( pSIMD->Base(), a ); |
|
} |
|
#endif |
|
#else |
|
|
|
// for the transitional class -- load a QuaternionAligned |
|
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) |
|
{ |
|
fltx4 retval = XMLoadVector4A( pSIMD.Base() ); |
|
return retval; |
|
} |
|
|
|
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) |
|
{ |
|
fltx4 retval = XMLoadVector4A( pSIMD ); |
|
return retval; |
|
} |
|
|
|
FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a ) |
|
{ |
|
XMStoreVector4A( pSIMD->Base(), a ); |
|
} |
|
|
|
#endif |
|
|
|
|
|
#if ALLOW_SIMD_QUATERNION_MATH |
|
//--------------------------------------------------------------------- |
|
// Make sure quaternions are within 180 degrees of one another, if not, reverse q |
|
//--------------------------------------------------------------------- |
|
FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q ) |
|
{ |
|
// decide if one of the quaternions is backwards |
|
fltx4 a = SubSIMD( p, q ); |
|
fltx4 b = AddSIMD( p, q ); |
|
a = Dot4SIMD( a, a ); |
|
b = Dot4SIMD( b, b ); |
|
fltx4 cmp = CmpGtSIMD( a, b ); |
|
fltx4 result = MaskedAssign( cmp, NegSIMD(q), q ); |
|
return result; |
|
} |
|
|
|
//--------------------------------------------------------------------- |
|
// Normalize Quaternion |
|
//--------------------------------------------------------------------- |
|
#if USE_STDC_FOR_SIMD |
|
|
|
FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q ) |
|
{ |
|
fltx4 radius, result; |
|
radius = Dot4SIMD( q, q ); |
|
|
|
if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON)) |
|
{ |
|
float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) ); |
|
result = ReplicateX4( iradius ); |
|
result = MulSIMD( result, q ); |
|
return result; |
|
} |
|
return q; |
|
} |
|
|
|
#else |
|
|
|
// SSE + X360 implementation |
|
FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q ) |
|
{ |
|
fltx4 radius, result, mask; |
|
radius = Dot4SIMD( q, q ); |
|
mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0 |
|
result = ReciprocalSqrtSIMD( radius ); |
|
result = MulSIMD( result, q ); |
|
return MaskedAssign( mask, q, result ); // if radius was 0, just return q |
|
} |
|
|
|
#endif |
|
|
|
|
|
//--------------------------------------------------------------------- |
|
// 0.0 returns p, 1.0 return q. |
|
//--------------------------------------------------------------------- |
|
FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) |
|
{ |
|
fltx4 sclp, sclq, result; |
|
sclq = ReplicateX4( t ); |
|
sclp = SubSIMD( Four_Ones, sclq ); |
|
result = MulSIMD( sclp, p ); |
|
result = MaddSIMD( sclq, q, result ); |
|
return QuaternionNormalizeSIMD( result ); |
|
} |
|
|
|
|
|
//--------------------------------------------------------------------- |
|
// Blend Quaternions |
|
//--------------------------------------------------------------------- |
|
FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t ) |
|
{ |
|
// decide if one of the quaternions is backwards |
|
fltx4 q2, result; |
|
q2 = QuaternionAlignSIMD( p, q ); |
|
result = QuaternionBlendNoAlignSIMD( p, q2, t ); |
|
return result; |
|
} |
|
|
|
|
|
//--------------------------------------------------------------------- |
|
// Multiply Quaternions |
|
//--------------------------------------------------------------------- |
|
#ifndef _X360 |
|
|
|
// SSE and STDC |
|
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) |
|
{ |
|
// decide if one of the quaternions is backwards |
|
fltx4 q2, result; |
|
q2 = QuaternionAlignSIMD( p, q ); |
|
SubFloat( result, 0 ) = SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 ); |
|
SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 ); |
|
SubFloat( result, 2 ) = SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 ); |
|
SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 ); |
|
return result; |
|
} |
|
|
|
#else |
|
|
|
// X360 |
|
extern const fltx4 g_QuatMultRowSign[4]; |
|
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) |
|
{ |
|
fltx4 q2, row, result; |
|
q2 = QuaternionAlignSIMD( p, q ); |
|
|
|
row = XMVectorSwizzle( q2, 3, 2, 1, 0 ); |
|
row = MulSIMD( row, g_QuatMultRowSign[0] ); |
|
result = Dot4SIMD( row, p ); |
|
|
|
row = XMVectorSwizzle( q2, 2, 3, 0, 1 ); |
|
row = MulSIMD( row, g_QuatMultRowSign[1] ); |
|
row = Dot4SIMD( row, p ); |
|
result = __vrlimi( result, row, 4, 0 ); |
|
|
|
row = XMVectorSwizzle( q2, 1, 0, 3, 2 ); |
|
row = MulSIMD( row, g_QuatMultRowSign[2] ); |
|
row = Dot4SIMD( row, p ); |
|
result = __vrlimi( result, row, 2, 0 ); |
|
|
|
row = MulSIMD( q2, g_QuatMultRowSign[3] ); |
|
row = Dot4SIMD( row, p ); |
|
result = __vrlimi( result, row, 1, 0 ); |
|
return result; |
|
} |
|
|
|
#endif |
|
|
|
|
|
//--------------------------------------------------------------------- |
|
// Quaternion scale |
|
//--------------------------------------------------------------------- |
|
#ifndef _X360 |
|
|
|
// SSE and STDC |
|
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) |
|
{ |
|
float r; |
|
fltx4 q; |
|
|
|
// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to |
|
// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. |
|
float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) ); |
|
sinom = min( sinom, 1.f ); |
|
|
|
float sinsom = sin( asin( sinom ) * t ); |
|
|
|
t = sinsom / (sinom + FLT_EPSILON); |
|
SubFloat( q, 0 ) = t * SubFloat( p, 0 ); |
|
SubFloat( q, 1 ) = t * SubFloat( p, 1 ); |
|
SubFloat( q, 2 ) = t * SubFloat( p, 2 ); |
|
|
|
// rescale rotation |
|
r = 1.0f - sinsom * sinsom; |
|
|
|
// Assert( r >= 0 ); |
|
if (r < 0.0f) |
|
r = 0.0f; |
|
r = sqrt( r ); |
|
|
|
// keep sign of rotation |
|
SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r ); |
|
return q; |
|
} |
|
|
|
#else |
|
|
|
// X360 |
|
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) |
|
{ |
|
fltx4 sinom = Dot3SIMD( p, p ); |
|
sinom = SqrtSIMD( sinom ); |
|
sinom = MinSIMD( sinom, Four_Ones ); |
|
fltx4 sinsom = ArcSinSIMD( sinom ); |
|
fltx4 t4 = ReplicateX4( t ); |
|
sinsom = MulSIMD( sinsom, t4 ); |
|
sinsom = SinSIMD( sinsom ); |
|
sinom = AddSIMD( sinom, Four_Epsilons ); |
|
sinom = ReciprocalSIMD( sinom ); |
|
t4 = MulSIMD( sinsom, sinom ); |
|
fltx4 result = MulSIMD( p, t4 ); |
|
|
|
// rescale rotation |
|
sinsom = MulSIMD( sinsom, sinsom ); |
|
fltx4 r = SubSIMD( Four_Ones, sinsom ); |
|
r = MaxSIMD( r, Four_Zeros ); |
|
r = SqrtSIMD( r ); |
|
|
|
// keep sign of rotation |
|
fltx4 cmp = CmpGeSIMD( p, Four_Zeros ); |
|
r = MaskedAssign( cmp, r, NegSIMD( r ) ); |
|
|
|
result = __vrlimi(result, r, 1, 0); |
|
return result; |
|
} |
|
|
|
#endif |
|
|
|
|
|
//----------------------------------------------------------------------------- |
|
// Quaternion sphereical linear interpolation |
|
//----------------------------------------------------------------------------- |
|
#ifndef _X360 |
|
|
|
// SSE and STDC |
|
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) |
|
{ |
|
float omega, cosom, sinom, sclp, sclq; |
|
|
|
fltx4 result; |
|
|
|
// 0.0 returns p, 1.0 return q. |
|
cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) + |
|
SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 ); |
|
|
|
if ( (1.0f + cosom ) > 0.000001f ) |
|
{ |
|
if ( (1.0f - cosom ) > 0.000001f ) |
|
{ |
|
omega = acos( cosom ); |
|
sinom = sin( omega ); |
|
sclp = sin( (1.0f - t)*omega) / sinom; |
|
sclq = sin( t*omega ) / sinom; |
|
} |
|
else |
|
{ |
|
// TODO: add short circuit for cosom == 1.0f? |
|
sclp = 1.0f - t; |
|
sclq = t; |
|
} |
|
SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 ); |
|
SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 ); |
|
SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 ); |
|
SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 ); |
|
} |
|
else |
|
{ |
|
SubFloat( result, 0 ) = -SubFloat( q, 1 ); |
|
SubFloat( result, 1 ) = SubFloat( q, 0 ); |
|
SubFloat( result, 2 ) = -SubFloat( q, 3 ); |
|
SubFloat( result, 3 ) = SubFloat( q, 2 ); |
|
sclp = sin( (1.0f - t) * (0.5f * M_PI)); |
|
sclq = sin( t * (0.5f * M_PI)); |
|
SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 ); |
|
SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 ); |
|
SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 ); |
|
} |
|
|
|
return result; |
|
} |
|
|
|
#else |
|
|
|
// X360 |
|
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) |
|
{ |
|
return XMQuaternionSlerp( p, q, t ); |
|
} |
|
|
|
#endif |
|
|
|
|
|
FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t ) |
|
{ |
|
fltx4 q2, result; |
|
q2 = QuaternionAlignSIMD( p, q ); |
|
result = QuaternionSlerpNoAlignSIMD( p, q2, t ); |
|
return result; |
|
} |
|
|
|
|
|
#endif // ALLOW_SIMD_QUATERNION_MATH |
|
|
|
#endif // SSEQUATMATH_H |
|
|
|
|