You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1129 lines
28 KiB
1129 lines
28 KiB
//========= Copyright Valve Corporation, All rights reserved. ============// |
|
// |
|
// Purpose: SSE Math primitives. |
|
// |
|
//=====================================================================================// |
|
|
|
#include <math.h> |
|
#include <float.h> // Needed for FLT_EPSILON |
|
#include "basetypes.h" |
|
#include <memory.h> |
|
#include "tier0/dbg.h" |
|
#include "mathlib/mathlib.h" |
|
#include "mathlib/vector.h" |
|
#if defined(__arm__) || defined(__arm64__) |
|
#include "sse2neon.h" |
|
#endif |
|
|
|
#include "sse.h" |
|
|
|
// memdbgon must be the last include file in a .cpp file!!! |
|
#include "tier0/memdbgon.h" |
|
|
|
#ifndef COMPILER_MSVC64 |
|
// Implement for 64-bit Windows if needed. |
|
|
|
static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 }; |
|
static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 }; |
|
|
|
//----------------------------------------------------------------------------- |
|
// Macros and constants required by some of the SSE assembly: |
|
//----------------------------------------------------------------------------- |
|
|
|
#ifdef _WIN32 |
|
#define _PS_EXTERN_CONST(Name, Val) \ |
|
const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } |
|
|
|
#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ |
|
const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \ |
|
|
|
#define _EPI32_CONST(Name, Val) \ |
|
static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val } |
|
|
|
#define _PS_CONST(Name, Val) \ |
|
static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } |
|
#elif POSIX |
|
#define _PS_EXTERN_CONST(Name, Val) \ |
|
const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } |
|
|
|
#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ |
|
const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \ |
|
|
|
#define _EPI32_CONST(Name, Val) \ |
|
static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } |
|
|
|
#define _PS_CONST(Name, Val) \ |
|
static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } |
|
#endif |
|
|
|
_PS_EXTERN_CONST(am_0, 0.0f); |
|
_PS_EXTERN_CONST(am_1, 1.0f); |
|
_PS_EXTERN_CONST(am_m1, -1.0f); |
|
_PS_EXTERN_CONST(am_0p5, 0.5f); |
|
_PS_EXTERN_CONST(am_1p5, 1.5f); |
|
_PS_EXTERN_CONST(am_pi, (float)M_PI); |
|
_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0)); |
|
_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI)); |
|
_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0)); |
|
_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI)); |
|
_PS_EXTERN_CONST_TYPE(am_sign_mask, uint32, 0x80000000); |
|
_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, uint32, ~0x80000000); |
|
_PS_EXTERN_CONST_TYPE(am_min_norm_pos,uint32, 0x00800000); |
|
_PS_EXTERN_CONST_TYPE(am_mant_mask, uint32, 0x7f800000); |
|
_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000); |
|
|
|
_EPI32_CONST(1, 1); |
|
_EPI32_CONST(2, 2); |
|
|
|
_PS_CONST(sincos_p0, 0.15707963267948963959e1f); |
|
_PS_CONST(sincos_p1, -0.64596409750621907082e0f); |
|
_PS_CONST(sincos_p2, 0.7969262624561800806e-1f); |
|
_PS_CONST(sincos_p3, -0.468175413106023168e-2f); |
|
|
|
#ifdef PFN_VECTORMA |
|
void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ); |
|
#endif |
|
|
|
//----------------------------------------------------------------------------- |
|
// SSE implementations of optimized routines: |
|
//----------------------------------------------------------------------------- |
|
float _SSE_Sqrt(float x) |
|
{ |
|
Assert( s_bMathlibInitialized ); |
|
float root = 0.f; |
|
#ifdef _WIN32 |
|
_asm |
|
{ |
|
sqrtss xmm0, x |
|
movss root, xmm0 |
|
} |
|
#elif POSIX |
|
_mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) ); |
|
#endif |
|
return root; |
|
} |
|
|
|
// Single iteration NewtonRaphson reciprocal square root: |
|
// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) |
|
// Very low error, and fine to use in place of 1.f / sqrtf(x). |
|
#if 0 |
|
float _SSE_RSqrtAccurate(float x) |
|
{ |
|
Assert( s_bMathlibInitialized ); |
|
|
|
float rroot; |
|
_asm |
|
{ |
|
rsqrtss xmm0, x |
|
movss rroot, xmm0 |
|
} |
|
|
|
return (0.5f * rroot) * (3.f - (x * rroot) * rroot); |
|
} |
|
#else |
|
|
|
#ifdef POSIX |
|
const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value |
|
const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value |
|
#endif |
|
|
|
// Intel / Kipps SSE RSqrt. Significantly faster than above. |
|
float _SSE_RSqrtAccurate(float a) |
|
{ |
|
|
|
#ifdef _WIN32 |
|
float x; |
|
float half = 0.5f; |
|
float three = 3.f; |
|
|
|
__asm |
|
{ |
|
movss xmm3, a; |
|
movss xmm1, half; |
|
movss xmm2, three; |
|
rsqrtss xmm0, xmm3; |
|
|
|
mulss xmm3, xmm0; |
|
mulss xmm1, xmm0; |
|
mulss xmm3, xmm0; |
|
subss xmm2, xmm3; |
|
mulss xmm1, xmm2; |
|
|
|
movss x, xmm1; |
|
} |
|
|
|
return x; |
|
#elif POSIX |
|
__m128 xx = _mm_load_ss( &a ); |
|
__m128 xr = _mm_rsqrt_ss( xx ); |
|
__m128 xt; |
|
|
|
xt = _mm_mul_ss( xr, xr ); |
|
xt = _mm_mul_ss( xt, xx ); |
|
xt = _mm_sub_ss( f3, xt ); |
|
xt = _mm_mul_ss( xt, f05 ); |
|
xr = _mm_mul_ss( xr, xt ); |
|
|
|
_mm_store_ss( &a, xr ); |
|
return a; |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
|
|
} |
|
#endif |
|
|
|
// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places |
|
// or so, so ok for closed transforms. (ie, computing lighting normals) |
|
float _SSE_RSqrtFast(float x) |
|
{ |
|
Assert( s_bMathlibInitialized ); |
|
|
|
float rroot; |
|
#if defined(__arm__) || defined(__arm64__) |
|
rroot = _SSE_RSqrtAccurate(x); |
|
#elif _WIN32 |
|
_asm |
|
{ |
|
rsqrtss xmm0, x |
|
movss rroot, xmm0 |
|
} |
|
#elif POSIX |
|
__asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) ); |
|
#else |
|
#error |
|
#endif |
|
|
|
return rroot; |
|
} |
|
|
|
float FASTCALL _SSE_VectorNormalize (Vector& vec) |
|
{ |
|
Assert( s_bMathlibInitialized ); |
|
|
|
// NOTE: This is necessary to prevent an memory overwrite... |
|
// sice vec only has 3 floats, we can't "movaps" directly into it. |
|
#ifdef _WIN32 |
|
__declspec(align(16)) float result[4]; |
|
#elif POSIX |
|
float result[4] __attribute__((aligned(16))); |
|
#endif |
|
|
|
float *v = &vec[0]; |
|
float *r = &result[0]; |
|
|
|
float radius = 0.f; |
|
// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't |
|
// be much of a performance win, considering you will very likely miss 3 branch predicts in a row. |
|
if ( v[0] || v[1] || v[2] ) |
|
{ |
|
#if defined(__arm__) || defined(__arm64__) |
|
float rsqrt = _SSE_RSqrtAccurate( v[0] * v[0] + v[1] * v[1] + v[2] * v[2] ); |
|
r[0] = v[0] * rsqrt; |
|
r[1] = v[1] * rsqrt; |
|
r[2] = v[2] * rsqrt; |
|
#elif _WIN32 |
|
_asm |
|
{ |
|
mov eax, v |
|
mov edx, r |
|
#ifdef ALIGNED_VECTOR |
|
movaps xmm4, [eax] // r4 = vx, vy, vz, X |
|
movaps xmm1, xmm4 // r1 = r4 |
|
#else |
|
movups xmm4, [eax] // r4 = vx, vy, vz, X |
|
movaps xmm1, xmm4 // r1 = r4 |
|
#endif |
|
mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X |
|
movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X |
|
movaps xmm2, xmm1 // r2 = r1 |
|
shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X |
|
addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X |
|
addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X |
|
sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X |
|
movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz)) |
|
rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X |
|
shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X |
|
mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X |
|
movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X |
|
} |
|
#elif POSIX |
|
__asm__ __volatile__( |
|
#ifdef ALIGNED_VECTOR |
|
"movaps %2, %%xmm4 \n\t" |
|
"movaps %%xmm4, %%xmm1 \n\t" |
|
#else |
|
"movups %2, %%xmm4 \n\t" |
|
"movaps %%xmm4, %%xmm1 \n\t" |
|
#endif |
|
"mulps %%xmm4, %%xmm1 \n\t" |
|
"movhlps %%xmm1, %%xmm3 \n\t" |
|
"movaps %%xmm1, %%xmm2 \n\t" |
|
"shufps $1, %%xmm2, %%xmm2 \n\t" |
|
"addss %%xmm2, %%xmm1 \n\t" |
|
"addss %%xmm3, %%xmm1 \n\t" |
|
"sqrtss %%xmm1, %%xmm1 \n\t" |
|
"movss %%xmm1, %0 \n\t" |
|
"rcpss %%xmm1, %%xmm1 \n\t" |
|
"shufps $0, %%xmm1, %%xmm1 \n\t" |
|
"mulps %%xmm1, %%xmm4 \n\t" |
|
"movaps %%xmm4, %1 \n\t" |
|
: "=m" (radius), "=m" (result) |
|
: "m" (*v) |
|
: "xmm1", "xmm2", "xmm3", "xmm4" |
|
); |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
vec.x = result[0]; |
|
vec.y = result[1]; |
|
vec.z = result[2]; |
|
|
|
} |
|
|
|
return radius; |
|
} |
|
|
|
void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) |
|
{ |
|
float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z ); |
|
|
|
vec.x *= ool; |
|
vec.y *= ool; |
|
vec.z *= ool; |
|
} |
|
|
|
float _SSE_InvRSquared(const float* v) |
|
{ |
|
float inv_r2 = 1.f; |
|
#if defined(__arm__) || defined(__arm64__) |
|
return _SSE_RSqrtAccurate( FLT_EPSILON + v[0] * v[0] + v[1] * v[1] + v[2] * v[2] ); |
|
#elif _WIN32 |
|
_asm { // Intel SSE only routine |
|
mov eax, v |
|
movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 |
|
#ifdef ALIGNED_VECTOR |
|
movaps xmm4, [eax] // x4 = vx, vy, vz, X |
|
#else |
|
movups xmm4, [eax] // x4 = vx, vy, vz, X |
|
#endif |
|
movaps xmm1, xmm4 // x1 = x4 |
|
mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X |
|
movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X |
|
movaps xmm2, xmm1 // x2 = x1 |
|
shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X |
|
addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X |
|
addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X |
|
maxss xmm1, xmm5 // x1 = max( 1.0, x1 ) |
|
rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 ) |
|
movss inv_r2, xmm0 // inv_r2 = x0 |
|
} |
|
#elif POSIX |
|
__asm__ __volatile__( |
|
"movss %0, %%xmm5 \n\t" |
|
#ifdef ALIGNED_VECTOR |
|
"movaps %1, %%xmm4 \n\t" |
|
#else |
|
"movups %1, %%xmm4 \n\t" |
|
#endif |
|
"movaps %%xmm4, %%xmm1 \n\t" |
|
"mulps %%xmm4, %%xmm1 \n\t" |
|
"movhlps %%xmm1, %%xmm3 \n\t" |
|
"movaps %%xmm1, %%xmm2 \n\t" |
|
"shufps $1, %%xmm2, %%xmm2 \n\t" |
|
"addss %%xmm2, %%xmm1 \n\t" |
|
"addss %%xmm3, %%xmm1 \n\t" |
|
"maxss %%xmm5, %%xmm1 \n\t" |
|
"rcpss %%xmm1, %%xmm0 \n\t" |
|
"movss %%xmm0, %0 \n\t" |
|
: "+m" (inv_r2) |
|
: "m" (*v) |
|
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
|
); |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
|
|
return inv_r2; |
|
} |
|
|
|
|
|
#ifdef POSIX |
|
// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } |
|
#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } |
|
|
|
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); |
|
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); |
|
|
|
|
|
#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } |
|
|
|
_PI32_CONST(1, 1); |
|
_PI32_CONST(inv1, ~1); |
|
_PI32_CONST(2, 2); |
|
_PI32_CONST(4, 4); |
|
_PI32_CONST(0x7f, 0x7f); |
|
_PS_CONST(1 , 1.0f); |
|
_PS_CONST(0p5, 0.5f); |
|
|
|
_PS_CONST(minus_cephes_DP1, -0.78515625); |
|
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); |
|
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); |
|
_PS_CONST(sincof_p0, -1.9515295891E-4); |
|
_PS_CONST(sincof_p1, 8.3321608736E-3); |
|
_PS_CONST(sincof_p2, -1.6666654611E-1); |
|
_PS_CONST(coscof_p0, 2.443315711809948E-005); |
|
_PS_CONST(coscof_p1, -1.388731625493765E-003); |
|
_PS_CONST(coscof_p2, 4.166664568298827E-002); |
|
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI |
|
|
|
typedef union xmm_mm_union { |
|
__m128 xmm; |
|
__m64 mm[2]; |
|
} xmm_mm_union; |
|
|
|
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; } |
|
|
|
typedef __m128 v4sf; // vector of 4 float (sse1) |
|
typedef __m64 v2si; // vector of 2 int (mmx) |
|
|
|
#endif |
|
|
|
void _SSE_SinCos(float x, float* s, float* c) |
|
{ |
|
#if defined(__arm__) || defined(__arm64__) |
|
#if defined( OSX ) |
|
__sincosf(x, s, c); |
|
#elif defined( POSIX ) |
|
sincosf(x, s, c); |
|
#else |
|
*s = sin( x ); |
|
*c = cos( x ); |
|
#endif |
|
#elif _WIN32 |
|
float t4, t8, t12; |
|
|
|
__asm |
|
{ |
|
movss xmm0, x |
|
movss t12, xmm0 |
|
movss xmm1, _ps_am_inv_sign_mask |
|
mov eax, t12 |
|
mulss xmm0, _ps_am_2_o_pi |
|
andps xmm0, xmm1 |
|
and eax, 0x80000000 |
|
|
|
cvttss2si edx, xmm0 |
|
mov ecx, edx |
|
mov t12, esi |
|
mov esi, edx |
|
add edx, 0x1 |
|
shl ecx, (31 - 1) |
|
shl edx, (31 - 1) |
|
|
|
movss xmm4, _ps_am_1 |
|
cvtsi2ss xmm3, esi |
|
mov t8, eax |
|
and esi, 0x1 |
|
|
|
subss xmm0, xmm3 |
|
movss xmm3, _sincos_inv_masks[esi * 4] |
|
minss xmm0, xmm4 |
|
|
|
subss xmm4, xmm0 |
|
|
|
movss xmm6, xmm4 |
|
andps xmm4, xmm3 |
|
and ecx, 0x80000000 |
|
movss xmm2, xmm3 |
|
andnps xmm3, xmm0 |
|
and edx, 0x80000000 |
|
movss xmm7, t8 |
|
andps xmm0, xmm2 |
|
mov t8, ecx |
|
mov t4, edx |
|
orps xmm4, xmm3 |
|
|
|
mov eax, s //mov eax, [esp + 4 + 16] |
|
mov edx, c //mov edx, [esp + 4 + 16 + 4] |
|
|
|
andnps xmm2, xmm6 |
|
orps xmm0, xmm2 |
|
|
|
movss xmm2, t8 |
|
movss xmm1, xmm0 |
|
movss xmm5, xmm4 |
|
xorps xmm7, xmm2 |
|
movss xmm3, _ps_sincos_p3 |
|
mulss xmm0, xmm0 |
|
mulss xmm4, xmm4 |
|
movss xmm2, xmm0 |
|
movss xmm6, xmm4 |
|
orps xmm1, xmm7 |
|
movss xmm7, _ps_sincos_p2 |
|
mulss xmm0, xmm3 |
|
mulss xmm4, xmm3 |
|
movss xmm3, _ps_sincos_p1 |
|
addss xmm0, xmm7 |
|
addss xmm4, xmm7 |
|
movss xmm7, _ps_sincos_p0 |
|
mulss xmm0, xmm2 |
|
mulss xmm4, xmm6 |
|
addss xmm0, xmm3 |
|
addss xmm4, xmm3 |
|
movss xmm3, t4 |
|
mulss xmm0, xmm2 |
|
mulss xmm4, xmm6 |
|
orps xmm5, xmm3 |
|
mov esi, t12 |
|
addss xmm0, xmm7 |
|
addss xmm4, xmm7 |
|
mulss xmm0, xmm1 |
|
mulss xmm4, xmm5 |
|
|
|
// use full stores since caller might reload with full loads |
|
movss [eax], xmm0 |
|
movss [edx], xmm4 |
|
} |
|
#elif POSIX |
|
|
|
Assert( "Needs testing, verify impl!\n" ); |
|
|
|
v4sf xx = _mm_load_ss( &x ); |
|
|
|
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; |
|
v2si mm0, mm1, mm2, mm3, mm4, mm5; |
|
sign_bit_sin = xx; |
|
/* take the absolute value */ |
|
xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); |
|
/* extract the sign bit (upper one) */ |
|
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask); |
|
|
|
/* scale by 4/Pi */ |
|
y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); |
|
|
|
/* store the integer part of y in mm2:mm3 */ |
|
xmm3 = _mm_movehl_ps(xmm3, y); |
|
mm2 = _mm_cvttps_pi32(y); |
|
mm3 = _mm_cvttps_pi32(xmm3); |
|
|
|
/* j=(j+1) & (~1) (see the cephes sources) */ |
|
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); |
|
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); |
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); |
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); |
|
|
|
y = _mm_cvtpi32x2_ps(mm2, mm3); |
|
|
|
mm4 = mm2; |
|
mm5 = mm3; |
|
|
|
/* get the swap sign flag for the sine */ |
|
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4); |
|
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4); |
|
mm0 = _mm_slli_pi32(mm0, 29); |
|
mm1 = _mm_slli_pi32(mm1, 29); |
|
v4sf swap_sign_bit_sin; |
|
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); |
|
|
|
/* get the polynom selection mask for the sine */ |
|
|
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); |
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); |
|
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); |
|
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); |
|
v4sf poly_mask; |
|
COPY_MM_TO_XMM(mm2, mm3, poly_mask); |
|
|
|
/* The magic pass: "Extended precision modular arithmetic" |
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */ |
|
xmm1 = *(v4sf*)_ps_minus_cephes_DP1; |
|
xmm2 = *(v4sf*)_ps_minus_cephes_DP2; |
|
xmm3 = *(v4sf*)_ps_minus_cephes_DP3; |
|
xmm1 = _mm_mul_ps(y, xmm1); |
|
xmm2 = _mm_mul_ps(y, xmm2); |
|
xmm3 = _mm_mul_ps(y, xmm3); |
|
xx = _mm_add_ps(xx, xmm1); |
|
xx = _mm_add_ps(xx, xmm2); |
|
xx = _mm_add_ps(xx, xmm3); |
|
|
|
/* get the sign flag for the cosine */ |
|
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2); |
|
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2); |
|
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4); |
|
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4); |
|
mm4 = _mm_slli_pi32(mm4, 29); |
|
mm5 = _mm_slli_pi32(mm5, 29); |
|
v4sf sign_bit_cos; |
|
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); |
|
_mm_empty(); /* good-bye mmx */ |
|
|
|
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); |
|
|
|
|
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */ |
|
v4sf z = _mm_mul_ps(xx,xx); |
|
y = *(v4sf*)_ps_coscof_p0; |
|
|
|
y = _mm_mul_ps(y, z); |
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); |
|
y = _mm_mul_ps(y, z); |
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); |
|
y = _mm_mul_ps(y, z); |
|
y = _mm_mul_ps(y, z); |
|
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); |
|
y = _mm_sub_ps(y, tmp); |
|
y = _mm_add_ps(y, *(v4sf*)_ps_1); |
|
|
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */ |
|
|
|
v4sf y2 = *(v4sf*)_ps_sincof_p0; |
|
y2 = _mm_mul_ps(y2, z); |
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); |
|
y2 = _mm_mul_ps(y2, z); |
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); |
|
y2 = _mm_mul_ps(y2, z); |
|
y2 = _mm_mul_ps(y2, xx); |
|
y2 = _mm_add_ps(y2, xx); |
|
|
|
/* select the correct result from the two polynoms */ |
|
xmm3 = poly_mask; |
|
v4sf ysin2 = _mm_and_ps(xmm3, y2); |
|
v4sf ysin1 = _mm_andnot_ps(xmm3, y); |
|
y2 = _mm_sub_ps(y2,ysin2); |
|
y = _mm_sub_ps(y, ysin1); |
|
|
|
xmm1 = _mm_add_ps(ysin1,ysin2); |
|
xmm2 = _mm_add_ps(y,y2); |
|
|
|
/* update the sign */ |
|
_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) ); |
|
_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) ); |
|
|
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
} |
|
|
|
float _SSE_cos( float x ) |
|
{ |
|
#if defined(__arm__) || defined(__arm64__) |
|
return cos(x); |
|
#elif _WIN32 |
|
float temp; |
|
__asm |
|
{ |
|
movss xmm0, x |
|
movss xmm1, _ps_am_inv_sign_mask |
|
andps xmm0, xmm1 |
|
addss xmm0, _ps_am_pi_o_2 |
|
mulss xmm0, _ps_am_2_o_pi |
|
|
|
cvttss2si ecx, xmm0 |
|
movss xmm5, _ps_am_1 |
|
mov edx, ecx |
|
shl edx, (31 - 1) |
|
cvtsi2ss xmm1, ecx |
|
and edx, 0x80000000 |
|
and ecx, 0x1 |
|
|
|
subss xmm0, xmm1 |
|
movss xmm6, _sincos_masks[ecx * 4] |
|
minss xmm0, xmm5 |
|
|
|
movss xmm1, _ps_sincos_p3 |
|
subss xmm5, xmm0 |
|
|
|
andps xmm5, xmm6 |
|
movss xmm7, _ps_sincos_p2 |
|
andnps xmm6, xmm0 |
|
mov temp, edx |
|
orps xmm5, xmm6 |
|
movss xmm0, xmm5 |
|
|
|
mulss xmm5, xmm5 |
|
movss xmm4, _ps_sincos_p1 |
|
movss xmm2, xmm5 |
|
mulss xmm5, xmm1 |
|
movss xmm1, _ps_sincos_p0 |
|
addss xmm5, xmm7 |
|
mulss xmm5, xmm2 |
|
movss xmm3, temp |
|
addss xmm5, xmm4 |
|
mulss xmm5, xmm2 |
|
orps xmm0, xmm3 |
|
addss xmm5, xmm1 |
|
mulss xmm0, xmm5 |
|
|
|
movss x, xmm0 |
|
|
|
} |
|
#elif POSIX |
|
|
|
Assert( "Needs testing, verify impl!\n" ); |
|
|
|
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; |
|
v2si mm0, mm1, mm2, mm3; |
|
/* take the absolute value */ |
|
v4sf xx = _mm_load_ss( &x ); |
|
|
|
xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); |
|
|
|
/* scale by 4/Pi */ |
|
y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); |
|
|
|
/* store the integer part of y in mm0:mm1 */ |
|
xmm2 = _mm_movehl_ps(xmm2, y); |
|
mm2 = _mm_cvttps_pi32(y); |
|
mm3 = _mm_cvttps_pi32(xmm2); |
|
|
|
/* j=(j+1) & (~1) (see the cephes sources) */ |
|
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); |
|
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); |
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); |
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); |
|
|
|
y = _mm_cvtpi32x2_ps(mm2, mm3); |
|
|
|
|
|
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); |
|
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2); |
|
|
|
/* get the swap sign flag in mm0:mm1 and the |
|
polynom selection mask in mm2:mm3 */ |
|
|
|
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4); |
|
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4); |
|
mm0 = _mm_slli_pi32(mm0, 29); |
|
mm1 = _mm_slli_pi32(mm1, 29); |
|
|
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); |
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); |
|
|
|
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); |
|
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); |
|
|
|
v4sf sign_bit, poly_mask; |
|
COPY_MM_TO_XMM(mm0, mm1, sign_bit); |
|
COPY_MM_TO_XMM(mm2, mm3, poly_mask); |
|
_mm_empty(); /* good-bye mmx */ |
|
|
|
/* The magic pass: "Extended precision modular arithmetic" |
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */ |
|
xmm1 = *(v4sf*)_ps_minus_cephes_DP1; |
|
xmm2 = *(v4sf*)_ps_minus_cephes_DP2; |
|
xmm3 = *(v4sf*)_ps_minus_cephes_DP3; |
|
xmm1 = _mm_mul_ps(y, xmm1); |
|
xmm2 = _mm_mul_ps(y, xmm2); |
|
xmm3 = _mm_mul_ps(y, xmm3); |
|
xx = _mm_add_ps(xx, xmm1); |
|
xx = _mm_add_ps(xx, xmm2); |
|
xx = _mm_add_ps(xx, xmm3); |
|
|
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */ |
|
y = *(v4sf*)_ps_coscof_p0; |
|
v4sf z = _mm_mul_ps(xx,xx); |
|
|
|
y = _mm_mul_ps(y, z); |
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); |
|
y = _mm_mul_ps(y, z); |
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); |
|
y = _mm_mul_ps(y, z); |
|
y = _mm_mul_ps(y, z); |
|
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); |
|
y = _mm_sub_ps(y, tmp); |
|
y = _mm_add_ps(y, *(v4sf*)_ps_1); |
|
|
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */ |
|
|
|
v4sf y2 = *(v4sf*)_ps_sincof_p0; |
|
y2 = _mm_mul_ps(y2, z); |
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); |
|
y2 = _mm_mul_ps(y2, z); |
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); |
|
y2 = _mm_mul_ps(y2, z); |
|
y2 = _mm_mul_ps(y2, xx); |
|
y2 = _mm_add_ps(y2, xx); |
|
|
|
/* select the correct result from the two polynoms */ |
|
xmm3 = poly_mask; |
|
y2 = _mm_and_ps(xmm3, y2); //, xmm3); |
|
y = _mm_andnot_ps(xmm3, y); |
|
y = _mm_add_ps(y,y2); |
|
/* update the sign */ |
|
|
|
_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) ); |
|
|
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
|
|
return x; |
|
} |
|
|
|
//----------------------------------------------------------------------------- |
|
// SSE2 implementations of optimized routines: |
|
//----------------------------------------------------------------------------- |
|
#ifdef PLATFORM_WINDOWS_PC32 |
|
void _SSE2_SinCos(float x, float* s, float* c) // any x |
|
{ |
|
#ifdef _WIN32 |
|
__asm |
|
{ |
|
movss xmm0, x |
|
movaps xmm7, xmm0 |
|
movss xmm1, _ps_am_inv_sign_mask |
|
movss xmm2, _ps_am_sign_mask |
|
movss xmm3, _ps_am_2_o_pi |
|
andps xmm0, xmm1 |
|
andps xmm7, xmm2 |
|
mulss xmm0, xmm3 |
|
|
|
pxor xmm3, xmm3 |
|
movd xmm5, _epi32_1 |
|
movss xmm4, _ps_am_1 |
|
|
|
cvttps2dq xmm2, xmm0 |
|
pand xmm5, xmm2 |
|
movd xmm1, _epi32_2 |
|
pcmpeqd xmm5, xmm3 |
|
movd xmm3, _epi32_1 |
|
cvtdq2ps xmm6, xmm2 |
|
paddd xmm3, xmm2 |
|
pand xmm2, xmm1 |
|
pand xmm3, xmm1 |
|
subss xmm0, xmm6 |
|
pslld xmm2, (31 - 1) |
|
minss xmm0, xmm4 |
|
|
|
mov eax, s // mov eax, [esp + 4 + 16] |
|
mov edx, c // mov edx, [esp + 4 + 16 + 4] |
|
|
|
subss xmm4, xmm0 |
|
pslld xmm3, (31 - 1) |
|
|
|
movaps xmm6, xmm4 |
|
xorps xmm2, xmm7 |
|
movaps xmm7, xmm5 |
|
andps xmm6, xmm7 |
|
andnps xmm7, xmm0 |
|
andps xmm0, xmm5 |
|
andnps xmm5, xmm4 |
|
movss xmm4, _ps_sincos_p3 |
|
orps xmm6, xmm7 |
|
orps xmm0, xmm5 |
|
movss xmm5, _ps_sincos_p2 |
|
|
|
movaps xmm1, xmm0 |
|
movaps xmm7, xmm6 |
|
mulss xmm0, xmm0 |
|
mulss xmm6, xmm6 |
|
orps xmm1, xmm2 |
|
orps xmm7, xmm3 |
|
movaps xmm2, xmm0 |
|
movaps xmm3, xmm6 |
|
mulss xmm0, xmm4 |
|
mulss xmm6, xmm4 |
|
movss xmm4, _ps_sincos_p1 |
|
addss xmm0, xmm5 |
|
addss xmm6, xmm5 |
|
movss xmm5, _ps_sincos_p0 |
|
mulss xmm0, xmm2 |
|
mulss xmm6, xmm3 |
|
addss xmm0, xmm4 |
|
addss xmm6, xmm4 |
|
mulss xmm0, xmm2 |
|
mulss xmm6, xmm3 |
|
addss xmm0, xmm5 |
|
addss xmm6, xmm5 |
|
mulss xmm0, xmm1 |
|
mulss xmm6, xmm7 |
|
|
|
// use full stores since caller might reload with full loads |
|
movss [eax], xmm0 |
|
movss [edx], xmm6 |
|
} |
|
#elif POSIX |
|
#warning "_SSE2_SinCos NOT implemented!" |
|
Assert( 0 ); |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
} |
|
#endif // PLATFORM_WINDOWS_PC32 |
|
|
|
#ifdef PLATFORM_WINDOWS_PC32 |
|
float _SSE2_cos(float x) |
|
{ |
|
#ifdef _WIN32 |
|
__asm |
|
{ |
|
movss xmm0, x |
|
movss xmm1, _ps_am_inv_sign_mask |
|
movss xmm2, _ps_am_pi_o_2 |
|
movss xmm3, _ps_am_2_o_pi |
|
andps xmm0, xmm1 |
|
addss xmm0, xmm2 |
|
mulss xmm0, xmm3 |
|
|
|
pxor xmm3, xmm3 |
|
movd xmm5, _epi32_1 |
|
movss xmm4, _ps_am_1 |
|
cvttps2dq xmm2, xmm0 |
|
pand xmm5, xmm2 |
|
movd xmm1, _epi32_2 |
|
pcmpeqd xmm5, xmm3 |
|
cvtdq2ps xmm6, xmm2 |
|
pand xmm2, xmm1 |
|
pslld xmm2, (31 - 1) |
|
|
|
subss xmm0, xmm6 |
|
movss xmm3, _ps_sincos_p3 |
|
minss xmm0, xmm4 |
|
subss xmm4, xmm0 |
|
andps xmm0, xmm5 |
|
andnps xmm5, xmm4 |
|
orps xmm0, xmm5 |
|
|
|
movaps xmm1, xmm0 |
|
movss xmm4, _ps_sincos_p2 |
|
mulss xmm0, xmm0 |
|
movss xmm5, _ps_sincos_p1 |
|
orps xmm1, xmm2 |
|
movaps xmm7, xmm0 |
|
mulss xmm0, xmm3 |
|
movss xmm6, _ps_sincos_p0 |
|
addss xmm0, xmm4 |
|
mulss xmm0, xmm7 |
|
addss xmm0, xmm5 |
|
mulss xmm0, xmm7 |
|
addss xmm0, xmm6 |
|
mulss xmm0, xmm1 |
|
movss x, xmm0 |
|
} |
|
#elif POSIX |
|
#warning "_SSE2_cos NOT implemented!" |
|
Assert( 0 ); |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
|
|
return x; |
|
} |
|
#endif // PLATFORM_WINDOWS_PC32 |
|
|
|
#if 0 |
|
// SSE Version of VectorTransform |
|
void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1) |
|
{ |
|
Assert( s_bMathlibInitialized ); |
|
Assert( in1 != out1 ); |
|
|
|
#ifdef _WIN32 |
|
__asm |
|
{ |
|
mov eax, in1; |
|
mov ecx, in2; |
|
mov edx, out1; |
|
|
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
addss xmm0, [ecx+12] |
|
movss [edx], xmm0; |
|
add ecx, 16; |
|
|
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
addss xmm0, [ecx+12] |
|
movss [edx+4], xmm0; |
|
add ecx, 16; |
|
|
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
addss xmm0, [ecx+12] |
|
movss [edx+8], xmm0; |
|
} |
|
#elif POSIX |
|
#warning "VectorTransformSSE C implementation only" |
|
out1[0] = DotProduct(in1, in2[0]) + in2[0][3]; |
|
out1[1] = DotProduct(in1, in2[1]) + in2[1][3]; |
|
out1[2] = DotProduct(in1, in2[2]) + in2[2][3]; |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
} |
|
#endif |
|
|
|
#if 0 |
|
void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ) |
|
{ |
|
Assert( s_bMathlibInitialized ); |
|
Assert( in1 != out1 ); |
|
|
|
#ifdef _WIN32 |
|
__asm |
|
{ |
|
mov eax, in1; |
|
mov ecx, in2; |
|
mov edx, out1; |
|
|
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
movss [edx], xmm0; |
|
add ecx, 16; |
|
|
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
movss [edx+4], xmm0; |
|
add ecx, 16; |
|
|
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
movss [edx+8], xmm0; |
|
} |
|
#elif POSIX |
|
#warning "VectorRotateSSE C implementation only" |
|
out1[0] = DotProduct( in1, in2[0] ); |
|
out1[1] = DotProduct( in1, in2[1] ); |
|
out1[2] = DotProduct( in1, in2[2] ); |
|
#else |
|
#error "Not Implemented" |
|
#endif |
|
} |
|
#endif |
|
|
|
#ifdef _WIN32 |
|
void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ) |
|
{ |
|
// FIXME: This don't work!! It will overwrite memory in the write to dest |
|
Assert(0); |
|
|
|
Assert( s_bMathlibInitialized ); |
|
_asm { // Intel SSE only routine |
|
mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 |
|
mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 |
|
mov edx, DWORD PTR [esp+0x10] ; *dest |
|
movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 |
|
#ifdef ALIGNED_VECTOR |
|
movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X |
|
pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale |
|
movaps xmm1, [eax] ; x1 = start1, start2, start3, X |
|
mulps xmm3, xmm2 ; x3 *= x2 |
|
addps xmm3, xmm1 ; x3 += x1 |
|
movaps [edx], xmm3 ; *dest = x3 |
|
#else |
|
movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X |
|
pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale |
|
movups xmm1, [eax] ; x1 = start1, start2, start3, X |
|
mulps xmm3, xmm2 ; x3 *= x2 |
|
addps xmm3, xmm1 ; x3 += x1 |
|
movups [edx], xmm3 ; *dest = x3 |
|
#endif |
|
} |
|
} |
|
#endif |
|
|
|
#ifdef _WIN32 |
|
#ifdef PFN_VECTORMA |
|
void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest ) |
|
{ |
|
// FIXME: This don't work!! It will overwrite memory in the write to dest |
|
Assert(0); |
|
|
|
Assert( s_bMathlibInitialized ); |
|
_asm |
|
{ |
|
// Intel SSE only routine |
|
mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 |
|
mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 |
|
mov edx, DWORD PTR [esp+0x10] ; *dest |
|
movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 |
|
#ifdef ALIGNED_VECTOR |
|
movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X |
|
pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale |
|
movaps xmm1, [eax] ; x1 = start1, start2, start3, X |
|
mulps xmm3, xmm2 ; x3 *= x2 |
|
addps xmm3, xmm1 ; x3 += x1 |
|
movaps [edx], xmm3 ; *dest = x3 |
|
#else |
|
movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X |
|
pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale |
|
movups xmm1, [eax] ; x1 = start1, start2, start3, X |
|
mulps xmm3, xmm2 ; x3 *= x2 |
|
addps xmm3, xmm1 ; x3 += x1 |
|
movups [edx], xmm3 ; *dest = x3 |
|
#endif |
|
} |
|
} |
|
float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA; |
|
#endif |
|
#endif |
|
|
|
|
|
// SSE DotProduct -- it's a smidgen faster than the asm DotProduct... |
|
// Should be validated too! :) |
|
// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like |
|
// this, but when inlined, and instruction scheduled, the C version is faster. |
|
// Verified this via VTune |
|
/* |
|
vec_t DotProduct (const vec_t *a, const vec_t *c) |
|
{ |
|
vec_t temp; |
|
|
|
__asm |
|
{ |
|
mov eax, a; |
|
mov ecx, c; |
|
mov edx, DWORD PTR [temp] |
|
movss xmm0, [eax]; |
|
mulss xmm0, [ecx]; |
|
movss xmm1, [eax+4]; |
|
mulss xmm1, [ecx+4]; |
|
movss xmm2, [eax+8]; |
|
mulss xmm2, [ecx+8]; |
|
addss xmm0, xmm1; |
|
addss xmm0, xmm2; |
|
movss [edx], xmm0; |
|
fld DWORD PTR [edx]; |
|
ret |
|
} |
|
} |
|
*/ |
|
|
|
#endif // COMPILER_MSVC64
|
|
|