You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
321 lines
10 KiB
321 lines
10 KiB
//========= Copyright Valve Corporation, All rights reserved. ============// |
|
// |
|
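// Purpose: Command-line test app for the SIMD math wrappers (fltx4 / FourVectors). Runs a small routine that is useful for inspecting compiler output, then times a SIMD point-constraint loop.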
|
// |
|
//===========================================================================// |
|
|
|
#include "tier0/platform.h" |
|
#include "tier0/progressbar.h" |
|
#include "bitmap/float_bm.h" |
|
#include "mathlib/mathlib.h" |
|
#include "tier2/tier2.h" |
|
#include "tier0/memdbgon.h" |
|
#include "mathlib/ssemath.h" |
|
|
|
#ifdef _X360 |
|
#include "xbox/xbox_console.h" |
|
#endif |
|
|
|
|
|
|
|
#define PROBLEM_SIZE 1000 |
|
#define N_ITERS 100000 |
|
//#define RECORD_OUTPUT |
|
|
|
|
|
static FourVectors g_XYZ[PROBLEM_SIZE]; |
|
static fltx4 g_CreationTime[PROBLEM_SIZE]; |
|
|
|
|
|
|
|
bool SIMDTest() |
|
{ |
|
const Vector StartPnt(0,0,0); |
|
const Vector MidP(0,0,100); |
|
const Vector EndPnt(100,0,50); |
|
|
|
// This app doesn't go through regular engine init, so init FPU/VPU math behaviour here: |
|
SetupFPUControlWord(); |
|
TestVPUFlags(); |
|
|
|
// Initialize g_XYZ[] and g_CreationTime[] |
|
SeedRandSIMD(1987301); |
|
for (int i = 0;i < PROBLEM_SIZE;i++) |
|
{ |
|
float fourStartTimes[4]; |
|
Vector fourPoints[4]; |
|
Vector offset; |
|
for (int j = 0;j < 4;j++) |
|
{ |
|
float t = (j + 4 * i) / (4.0f * (PROBLEM_SIZE - 1)); |
|
fourStartTimes[j] = t; |
|
fourPoints[j] = StartPnt + t*( EndPnt - StartPnt ); |
|
offset.Random( -10.0f, +10.0f ); |
|
fourPoints[j] += offset; |
|
} |
|
g_XYZ[i].LoadAndSwizzle( fourPoints[0], fourPoints[1], fourPoints[2], fourPoints[3] ); |
|
g_CreationTime[i] = LoadUnalignedSIMD( fourStartTimes ); |
|
} |
|
|
|
#ifdef RECORD_OUTPUT |
|
char outputBuffer[1024]; |
|
Q_snprintf( outputBuffer, sizeof( outputBuffer ), "float testOutput[%d][4][3] = {\n", N_ITERS ); |
|
Warning(outputBuffer); |
|
#endif // RECORD_OUTPUT |
|
|
|
double STime=Plat_FloatTime(); |
|
bool bChangedSomething = false; |
|
for(int i=0;i<N_ITERS;i++) |
|
{ |
|
float t=i*(1.0/N_ITERS); |
|
FourVectors * __restrict pXYZ = g_XYZ; |
|
|
|
fltx4 * __restrict pCreationTime = g_CreationTime; |
|
|
|
fltx4 CurTime = ReplicateX4( t ); |
|
fltx4 TimeScale = ReplicateX4( 1.0/(max(0.001, 1.0 ) ) ); |
|
|
|
// calculate radius spline |
|
bool bConstantRadius = true; |
|
fltx4 Rad0=ReplicateX4(2.0); |
|
fltx4 Radm=Rad0; |
|
fltx4 Rad1=Rad0; |
|
|
|
fltx4 RadmMinusRad0=SubSIMD( Radm, Rad0); |
|
fltx4 Rad1MinusRadm=SubSIMD( Rad1, Radm); |
|
|
|
fltx4 SIMDMinDist=ReplicateX4( 2.0 ); |
|
fltx4 SIMDMinDist2=ReplicateX4( 2.0*2.0 ); |
|
|
|
fltx4 SIMDMaxDist=MaxSIMD( Rad0, MaxSIMD( Radm, Rad1 ) ); |
|
fltx4 SIMDMaxDist2=MulSIMD( SIMDMaxDist, SIMDMaxDist); |
|
|
|
|
|
FourVectors StartP; |
|
StartP.DuplicateVector( StartPnt ); |
|
|
|
FourVectors MiddleP; |
|
MiddleP.DuplicateVector( MidP ); |
|
|
|
// form delta terms needed for quadratic bezier |
|
FourVectors Delta0; |
|
Delta0.DuplicateVector( MidP-StartPnt ); |
|
|
|
FourVectors Delta1; |
|
Delta1.DuplicateVector( EndPnt-MidP ); |
|
int nLoopCtr = PROBLEM_SIZE; |
|
do |
|
{ |
|
fltx4 TScale=MinSIMD( |
|
Four_Ones, |
|
MulSIMD( TimeScale, SubSIMD( CurTime, *pCreationTime ) ) ); |
|
|
|
// bezier(a,b,c,t)=lerp( lerp(a,b,t),lerp(b,c,t),t) |
|
FourVectors L0 = Delta0; |
|
L0 *= TScale; |
|
L0 += StartP; |
|
|
|
FourVectors L1= Delta1; |
|
L1 *= TScale; |
|
L1 += MiddleP; |
|
|
|
FourVectors Center = L1; |
|
Center -= L0; |
|
Center *= TScale; |
|
Center += L0; |
|
|
|
FourVectors pts_original = *(pXYZ); |
|
FourVectors pts = pts_original; |
|
pts -= Center; |
|
|
|
// calculate radius at the point. !!speed!! - use special case for constant radius |
|
|
|
fltx4 dist_squared= pts * pts; |
|
fltx4 TooFarMask = CmpGtSIMD( dist_squared, SIMDMaxDist2 ); |
|
if ( ( !bConstantRadius) && ( ! IsAnyNegative( TooFarMask ) ) ) |
|
{ |
|
// need to calculate and adjust for true radius =- we've only trivially rejected note |
|
// voodoo here - we update simdmaxdist for true radius, but not max dist^2, since |
|
// that's used only for the trivial reject case, which we've already done |
|
fltx4 R0=AddSIMD( Rad0, MulSIMD( RadmMinusRad0, TScale ) ); |
|
fltx4 R1=AddSIMD( Radm, MulSIMD( Rad1MinusRadm, TScale ) ); |
|
SIMDMaxDist = AddSIMD( R0, MulSIMD( SubSIMD( R1, R0 ), TScale) ); |
|
|
|
// now that we know the true radius, update our mask |
|
TooFarMask = CmpGtSIMD( dist_squared, MulSIMD( SIMDMaxDist, SIMDMaxDist ) ); |
|
} |
|
|
|
fltx4 TooCloseMask = CmpLtSIMD( dist_squared, SIMDMinDist2 ); |
|
fltx4 NeedAdjust = OrSIMD( TooFarMask, TooCloseMask ); |
|
if ( IsAnyNegative( NeedAdjust ) ) // any out of bounds? |
|
{ |
|
// change squared distance into approximate rsqr root |
|
fltx4 guess=ReciprocalSqrtEstSIMD(dist_squared); |
|
// newton iteration for 1/sqrt(x) : y(n+1)=1/2 (y(n)*(3-x*y(n)^2)); |
|
guess=MulSIMD(guess,SubSIMD(Four_Threes,MulSIMD(dist_squared,MulSIMD(guess,guess)))); |
|
guess=MulSIMD(Four_PointFives,guess); |
|
pts *= guess; |
|
|
|
FourVectors clamp_far=pts; |
|
clamp_far *= SIMDMaxDist; |
|
clamp_far += Center; |
|
FourVectors clamp_near=pts; |
|
clamp_near *= SIMDMinDist; |
|
clamp_near += Center; |
|
pts.x = MaskedAssign( TooCloseMask, clamp_near.x, MaskedAssign( TooFarMask, clamp_far.x, pts_original.x )); |
|
pts.y = MaskedAssign( TooCloseMask, clamp_near.y, MaskedAssign( TooFarMask, clamp_far.y, pts_original.y )); |
|
pts.z = MaskedAssign( TooCloseMask, clamp_near.z, MaskedAssign( TooFarMask, clamp_far.z, pts_original.z )); |
|
*(pXYZ) = pts; |
|
bChangedSomething = true; |
|
} |
|
|
|
#ifdef RECORD_OUTPUT |
|
if (nLoopCtr == 257) |
|
{ |
|
Q_snprintf( outputBuffer, sizeof( outputBuffer ), "/*%04d:*/ { {%+14e,%+14e,%+14e}, {%+14e,%+14e,%+14e}, {%+14e,%+14e,%+14e}, {%+14e,%+14e,%+14e} },\n", i, |
|
pXYZ->X(0), pXYZ->Y(0), pXYZ->Z(0), |
|
pXYZ->X(1), pXYZ->Y(1), pXYZ->Z(1), |
|
pXYZ->X(2), pXYZ->Y(2), pXYZ->Z(2), |
|
pXYZ->X(3), pXYZ->Y(3), pXYZ->Z(3)); |
|
Warning(outputBuffer); |
|
} |
|
#endif // RECORD_OUTPUT |
|
|
|
++pXYZ; |
|
++pCreationTime; |
|
} while ( --nLoopCtr ); |
|
} |
|
double ETime=Plat_FloatTime()-STime; |
|
|
|
#ifdef RECORD_OUTPUT |
|
Q_snprintf( outputBuffer, sizeof( outputBuffer ), " };\n" ); |
|
Warning(outputBuffer); |
|
#endif // RECORD_OUTPUT |
|
|
|
printf("elapsed time=%f p/s=%f\n",ETime, (4.0*PROBLEM_SIZE*N_ITERS)/ETime ); |
|
return bChangedSomething; |
|
} |
|
|
|
|
|
#ifdef _X360 |
|
|
|
// Minimal wrapper around a native __vector4, declared passinreg.
// Only referenced by the commented-out DECL_ASS variants in
// OctoberXDKCompilerIssueTestCode().
__declspec(passinreg) struct float4
{
	// The wrapped vector
	__vector4 vmx;

	// Implicit conversion back to the native vector type
	operator __vector4 () const
	{
		return vmx;
	}
};
|
|
|
void OctoberXDKCompilerIssueTestCode( const fltx4 & val, fltx4 * out ) |
|
{ |
|
// UNDONE: This code demonstrates serious 360 compiler issues. XBox Developer Support has been contacted. |
|
// The assembly contains tons of useless instructions (vector stores and supporting integer math), even in the |
|
// below code - no use of pointers or static constants, no wrapper layers on top of the vector intrinsics. |
|
// If/when the compiler issue is resolved, other known issues are: |
|
// - pass vector params by const reference |
|
// - avoid putting __vector4 in a union or an array |
|
// - avoid default constructors, return constructed objects directly ("return VecClass(__vector4Val);") |
|
|
|
#define DECL_ASS( _var_, _val_ ) fltx4 _var_ = _val_ |
|
//#define DECL_ASS( _var_, _val_ ) float4 _var_; _var_.vmx = _val_ |
|
//#define DECL_ASS( _var_, _val_ ) float4 _var_( _val_ ) |
|
|
|
DECL_ASS( resultx, Four_Zeros ); DECL_ASS( resulty, Four_Zeros ); DECL_ASS( resultz, Four_Zeros ); |
|
|
|
DECL_ASS( CurTime, __vmulfp( val, Four_PointFives ) ); |
|
DECL_ASS( TimeScale, val ); |
|
//fltx4 *pCreationTime = g_CreationTime; |
|
DECL_ASS( Delta0x, val ); DECL_ASS( Delta0y, val ); DECL_ASS( Delta0z, val ); |
|
DECL_ASS( Delta1x, __vaddfp(Delta0x, Delta0x) ); DECL_ASS( Delta1y, __vaddfp(Delta0y, Delta0y) ); DECL_ASS( Delta1z, __vaddfp(Delta0z, Delta0z) ); |
|
DECL_ASS( StartPx, __vaddfp(Delta0x, Delta0x) ); DECL_ASS( StartPy, __vaddfp(Delta0y, Delta0y) ); DECL_ASS( StartPz, __vaddfp(Delta0z, Delta0z) ); |
|
DECL_ASS( MiddlePx, __vaddfp(StartPx, StartPx) ); DECL_ASS( MiddlePy, __vaddfp(StartPy, StartPy) ); DECL_ASS( MiddlePz, __vaddfp(StartPz, StartPz) ); |
|
for (int i = 0;i < 1000;i++) |
|
{ |
|
DECL_ASS( TScale, __vsubfp( CurTime, resultx ) );//*pCreationTime ); |
|
TScale = __vmulfp( TScale, TimeScale ); |
|
TScale = __vminfp( TScale, resulty );//Four_Ones ); |
|
|
|
//resultx = __vaddfp( resultx, TScale ); |
|
//resulty = __vaddfp( resulty, TScale ); |
|
//resultz = __vaddfp( resultz, TScale ); |
|
|
|
DECL_ASS( L0x, Delta0x ); DECL_ASS( L0y, Delta0y ); DECL_ASS( L0z, Delta0z ); |
|
L0x = __vmulfp(L0x,TScale); L0y = __vmulfp(L0y,TScale); L0z = __vmulfp(L0z,TScale); |
|
L0x = __vaddfp(StartPx,L0x); L0y = __vaddfp(StartPy,L0y); L0z = __vaddfp(StartPz,L0z); |
|
|
|
DECL_ASS( L1x, Delta1x ); DECL_ASS( L1y, Delta1y ); DECL_ASS( L1z, Delta1z ); |
|
L1x = __vmulfp(L1x,TScale); L1y = __vmulfp(L1y,TScale); L1z = __vmulfp(L1z,TScale); |
|
L1x = __vaddfp(MiddlePx,L1x); L1y = __vaddfp(MiddlePy,L1y); L1z = __vaddfp(MiddlePz,L1z); |
|
|
|
L0x = __vaddfp(L0x,L1x); L0y = __vaddfp(L0y,L1y); L0z = __vaddfp(L0z,L1z); |
|
|
|
resultx = __vaddfp( resultx, L0x ); |
|
resulty = __vaddfp( resulty, L0y ); |
|
resultz = __vaddfp( resultz, L0z ); |
|
|
|
//pCreationTime++; |
|
} |
|
|
|
out[0] = resultx; |
|
out[1] = resulty; |
|
out[2] = resultz; |
|
} |
|
|
|
#else // _X360 |
|
|
|
void |
|
SSEClassTest( const fltx4 & val, fltx4 & out ) |
|
{ |
|
fltx4 result = Four_Zeros; |
|
for (int i = 0;i < N_ITERS;i++) |
|
{ |
|
result = SubSIMD( val, result ); |
|
result = MulSIMD( val, result ); |
|
result = AddSIMD( val, result ); |
|
result = MinSIMD( val, result ); |
|
} |
|
FourVectors result4; result4.x = result; result4.y = result; result4.z = result; |
|
for (int i = 0;i < N_ITERS;i++) |
|
{ |
|
result4 *= result4; |
|
result4 += result4; |
|
result4 *= result4; |
|
result4 += result4; |
|
} |
|
result = result4*result4; |
|
out = result; |
|
} |
|
|
|
#endif // !_X360 |
|
|
|
|
|
int main(int argc,char **argv) |
|
{ |
|
#ifndef _X360 |
|
|
|
// UNDONE: InitCommandLineProgram needs fixing for 360 (if we want to make lots of new 360 executables) |
|
InitCommandLineProgram( argc, argv ); |
|
|
|
// This function is useful for inspecting compiler output |
|
fltx4 result; |
|
SSEClassTest( Four_PointFives, result ); |
|
printf("(%f,%f,%f,%f)\n", SubFloat( result, 0 ), SubFloat( result, 1 ), SubFloat( result, 2 ), SubFloat( result, 3 ) ); |
|
|
|
#else // _X360 |
|
|
|
// Wait for VXConsole, so that all debug output goes there |
|
XBX_InitConsoleMonitor(true); |
|
|
|
// This function is useful for inspecting compiler output |
|
FourVectors result; |
|
OctoberXDKCompilerIssueTestCode( Four_PointFives, (fltx4 *)&result ); |
|
printf("(%f,%f,%f,%f)\n", result.X(0), result.X(1), result.X(2), result.X(3)); |
|
printf("(%f,%f,%f,%f)\n", result.Y(0), result.Y(1), result.Y(2), result.Y(3)); |
|
printf("(%f,%f,%f,%f)\n", result.Z(0), result.Z(1), result.Z(2), result.Z(3)); |
|
|
|
#endif // _X360 |
|
|
|
// Run the perf. test |
|
SIMDTest(); |
|
|
|
return 0; |
|
}
|
|
|