Merge pull request #248 from nillerusr/mathlib-optimize

Mathlib optimize
2 years ago · 697a9f34f9
11 changed files with 1522 additions and 1775 deletions
--- a/.gitignore
+++ b/.gitignore
@ -37,5 +37,3 @@ waf3*/
 .vscode/
 .depproj/
 source-engine.sln
 hl2/
--- a/game/server/hl2/ai_behavior_police.cpp
+++ b/game/server/hl2/ai_behavior_police.cpp
@ -33,6 +33,7 @@ CAI_PolicingBehavior::CAI_PolicingBehavior( void )
 	m_bEnabled = false;
 	m_nNumWarnings = 0;
 	m_bTargetIsHostile = false;
 	m_hPoliceGoal = NULL;
 }
 //-----------------------------------------------------------------------------
--- a/materialsystem/imaterialsysteminternal.h
+++ b/materialsystem/imaterialsysteminternal.h
@ -31,9 +31,9 @@ public:
 	{
 		MEM_ALLOC_CREDIT_( "CMatCallQueue.m_Allocator" );
 #ifdef SWDS
-		m_Allocator.Init( 2*1024, 0, 0, 4 );
+		m_Allocator.Init( 2*1024, 0, 0, 16 );
 #else
-		m_Allocator.Init( IsX360() ? 2*1024*1024 : 8*1024*1024, 64*1024, 256*1024, 4 );
+		m_Allocator.Init( IsX360() ? 2*1024*1024 : 8*1024*1024, 64*1024, 256*1024, 16 );
 #endif
 		m_FunctorFactory.SetAllocator( &m_Allocator );
 		m_pHead = m_pTail = NULL;
--- a/mathlib/mathlib_base.cpp
+++ b/mathlib/mathlib_base.cpp
@ -420,13 +420,6 @@ void MatrixGetColumn( const matrix3x4_t& in, int column, Vector &out )
 	out.z = in[2][column];
 }
 // Stores the x, y and z components of 'in' into rows 0, 1 and 2 of the
 // requested column of 'out'. Other columns of 'out' are left untouched.
 void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
 {
 	const float flComponents[3] = { in.x, in.y, in.z };
 	for ( int iRow = 0; iRow < 3; ++iRow )
 	{
 		out[iRow][column] = flComponents[iRow];
 	}
 }
 void MatrixScaleBy ( const float flScale, matrix3x4_t &out )
 {
 	out[0][0] *= flScale;
@ -1092,57 +1085,6 @@ void SetScaleMatrix( float x, float y, float z, matrix3x4_t &dst )
 	dst[2][0] = 0.0f;	dst[2][1] = 0.0f;	dst[2][2] = z;		dst[2][3] = 0.0f;
 }
 //-----------------------------------------------------------------------------
 // Purpose: Builds the matrix for a counterclockwise rotation about an arbitrary axis.
 //
 //		   | ax2 + (1 - ax2)cosQ		axay(1 - cosQ) - azsinQ		azax(1 - cosQ) + aysinQ |
 // Ra(Q) = | axay(1 - cosQ) + azsinQ	ay2 + (1 - ay2)cosQ			ayaz(1 - cosQ) - axsinQ |
 //		   | azax(1 - cosQ) - aysinQ	ayaz(1 - cosQ) + axsinQ		az2 + (1 - az2)cosQ     |
 //
 // Input  : vAxisOfRot - axis (ax, ay, az) to rotate about; used as given, it is not normalized here
 //			angleDegrees - rotation angle Q, in degrees
 // Output : dst - receives the rotation in columns 0-2; column 3 (translation) is set to zero
 //-----------------------------------------------------------------------------
 void MatrixBuildRotationAboutAxis( const Vector &vAxisOfRot, float angleDegrees, matrix3x4_t &dst )
 {
 	// Degrees -> radians, then the two trig terms every matrix entry is built from.
 	const float flRadians = angleDegrees * ( M_PI / 180.0 );
 	const float flSin = sin( flRadians );
 	const float flCos = cos( flRadians );
 	const float flOneMinusCos = 1 - flCos;

 	const float ax = vAxisOfRot[0];
 	const float ay = vAxisOfRot[1];
 	const float az = vAxisOfRot[2];

 	// Squared components feed the diagonal.
 	const float axSq = ax * ax;
 	const float aySq = ay * ay;
 	const float azSq = az * az;

 	// Pairwise products scaled by (1 - cosQ), plus the sine terms, feed the off-diagonals.
 	const float xyTerm = ax * ay * flOneMinusCos;
 	const float zxTerm = az * ax * flOneMinusCos;
 	const float yzTerm = ay * az * flOneMinusCos;
 	const float xSin = ax * flSin;
 	const float ySin = ay * flSin;
 	const float zSin = az * flSin;

 	// Row 0:
 	dst[0][0] = axSq + (1 - axSq) * flCos;
 	dst[0][1] = xyTerm - zSin;
 	dst[0][2] = zxTerm + ySin;
 	dst[0][3] = 0;

 	// Row 1:
 	dst[1][0] = xyTerm + zSin;
 	dst[1][1] = aySq + (1 - aySq) * flCos;
 	dst[1][2] = yzTerm - xSin;
 	dst[1][3] = 0;

 	// Row 2:
 	dst[2][0] = zxTerm - ySin;
 	dst[2][1] = yzTerm + xSin;
 	dst[2][2] = azSq + (1 - azSq) * flCos;
 	dst[2][3] = 0;
 }
 //-----------------------------------------------------------------------------
 // Computes the transpose
 //-----------------------------------------------------------------------------
@ -1450,33 +1392,6 @@ void VectorYawRotate( const Vector &in, float flYaw, Vector &out)
 	out.z = in.z;
 }
 //-----------------------------------------------------------------------------
 // Purpose: Evaluates the bias curve: x raised to the power log(biasAmt) / log(0.5).
 // Input  : x - value to remap
 //			biasAmt - bias amount; 0.5 gives an exponent of ~1, so x is returned (almost) unchanged
 // Output : pow( x, exponent ); asserts that the result is not NaN
 //-----------------------------------------------------------------------------
 float Bias( float x, float biasAmt )
 {
 	// The exponent is computed per call and kept in a local. No function-level
 	// static state is shared between callers, so concurrent calls cannot observe
 	// each other's exponent.
 	const float flExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
 	const float fRet = pow( x, flExponent );
 	Assert ( !IS_NAN( fRet ) );
 	return fRet;
 }
 // Gain curve built from two mirrored Bias() evaluations: the lower half
 // (x < 0.5) is a scaled Bias of 2x, the upper half is that curve reflected
 // about the point (0.5, 0.5). Both halves use a bias amount of (1 - biasAmt).
 float Gain( float x, float biasAmt )
 {
 	// WARNING: not thread safe
 	const float flInverseBias = 1-biasAmt;

 	if( x < 0.5 )
 	{
 		return 0.5f * Bias( 2*x, flInverseBias );
 	}

 	return 1 - 0.5f * Bias( 2 - 2*x, flInverseBias );
 }
 float SmoothCurve( float x )
 {
 	// Actual smooth curve. Visualization:
--- a/mathlib/vmatrix.cpp
+++ b/mathlib/vmatrix.cpp
--- a/public/mathlib/math_pfns.h
+++ b/public/mathlib/math_pfns.h
@ -22,10 +22,16 @@ extern float (*pfFastCos)(float x);
 // The following are not declared as macros because they are often used in limiting situations,
 // and sometimes the compiler simply refuses to inline them for some reason
-#define FastSqrt(x)			(*pfSqrt)(x)
+#define FastSqrt(x)			sqrtf(x)
-#define	FastRSqrt(x)		(*pfRSqrt)(x)
+#define	FastRSqrt(x)		(1.f/sqrtf(x))
-#define FastRSqrtFast(x)    (*pfRSqrtFast)(x)
+#define FastRSqrtFast(x)    (1.f/sqrtf(x))
 #ifdef _WIN32
 #define FastSinCos(x,s,c)   (*pfFastSinCos)(x,s,c)
 #else
 #define FastSinCos(x,s,c)   sincosf(x,s,c)
 #endif
 #define FastCos(x)			(*pfFastCos)(x)
 #if defined(__i386__) || defined(_M_IX86)
--- a/public/mathlib/mathlib.h
+++ b/public/mathlib/mathlib.h
@ -30,7 +30,6 @@
 // FP exception clean so this not a turnkey operation.
 //#define FP_EXCEPTIONS_ENABLED
 #ifdef FP_EXCEPTIONS_ENABLED
 #include <float.h> // For _clearfp and _controlfp_s
 #endif
@ -93,37 +92,11 @@ private:
 	FPExceptionEnabler& operator=(const FPExceptionEnabler&);
 };
-
+inline float clamp( const float val, const float minVal, const float maxVal )
 #ifdef DEBUG  // stop crashing edit-and-continue
 FORCEINLINE float clamp( float val, float minVal, float maxVal )
 {
 	if ( maxVal < minVal )
 		return maxVal;
 	else if( val < minVal )
 		return minVal;
 	else if( val > maxVal )
 		return maxVal;
 	else
 		return val;
 }
 #else // DEBUG
 FORCEINLINE float clamp( float val, float minVal, float maxVal )
 {
-#if defined(__i386__) || defined(_M_IX86)
+	const float t = val < minVal ? minVal : val;
-	_mm_store_ss( &val,
+	return t > maxVal ? maxVal : t;
 		_mm_min_ss(
 			_mm_max_ss(
 				_mm_load_ss(&val),
 				_mm_load_ss(&minVal) ),
 			_mm_load_ss(&maxVal) ) );
 #else
 	val = fpmax(minVal, val);
 	val = fpmin(maxVal, val);
 #endif
 	return val;
 }
 #endif // DEBUG
 //
 // Returns a clamped value in the range [min, max].
@ -131,17 +104,10 @@ FORCEINLINE float clamp( float val, float minVal, float maxVal )
 template< class T >
 inline T clamp( T const &val, T const &minVal, T const &maxVal )
 {
-	if ( maxVal < minVal )
+	const T t = val< minVal ? minVal : val;
-		return maxVal;
+	return t > maxVal ? maxVal : t;
 	else if( val < minVal )
 		return minVal;
 	else if( val > maxVal )
 		return maxVal;
 	else
 		return val;
 }
 // plane_t structure
 // !!! if this is changed, it must be changed in asm code too !!!
 // FIXME: does the asm code even exist anymore?
@ -237,8 +203,8 @@ bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t
 struct matrix3x4_t
 {
-	matrix3x4_t() = default;
+	inline matrix3x4_t() = default;
-	matrix3x4_t( 
+	inline matrix3x4_t(
 		float m00, float m01, float m02, float m03,
 		float m10, float m11, float m12, float m13,
 		float m20, float m21, float m22, float m23 )
@ -252,7 +218,7 @@ struct matrix3x4_t
 	// Creates a matrix where the X axis = forward
 	// the Y axis = left, and the Z axis = up
 	//-----------------------------------------------------------------------------
-	void Init( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
+	inline void Init( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
 	{
 		m_flMatVal[0][0] = xAxis.x; m_flMatVal[0][1] = yAxis.x; m_flMatVal[0][2] = zAxis.x; m_flMatVal[0][3] = vecOrigin.x;
 		m_flMatVal[1][0] = xAxis.y; m_flMatVal[1][1] = yAxis.y; m_flMatVal[1][2] = zAxis.y; m_flMatVal[1][3] = vecOrigin.y;
@ -263,26 +229,23 @@ struct matrix3x4_t
 	// Creates a matrix where the X axis = forward
 	// the Y axis = left, and the Z axis = up
 	//-----------------------------------------------------------------------------
-	matrix3x4_t( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
+	inline matrix3x4_t( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
 	{
 		Init( xAxis, yAxis, zAxis, vecOrigin );
 	}
 	inline void Invalidate( void )
 	{
-		for (int i = 0; i < 3; i++)
+		for( int i=0; i < 12; i++ )
 		{
-			for (int j = 0; j < 4; j++)
+			((float*)m_flMatVal)[i] = VEC_T_NAN;
 			{
 				m_flMatVal[i][j] = VEC_T_NAN;
 			}
 		}
 	}
-	float *operator[]( int i )				{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
+	inline float *operator[]( int i )				{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
-	const float *operator[]( int i ) const	{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
+	inline const float *operator[]( int i ) const	{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
-	float *Base()							{ return &m_flMatVal[0][0]; }
+	inline float *Base()							{ return &m_flMatVal[0][0]; }
-	const float *Base() const				{ return &m_flMatVal[0][0]; }
+	inline const float *Base() const				{ return &m_flMatVal[0][0]; }
 	float m_flMatVal[3][4];
 };
@ -565,7 +528,13 @@ void MatrixInvert( const matrix3x4_t &in, matrix3x4_t &out );
 bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float flTolerance = 1e-5 );
 void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out );
-void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out );
+
 // Stores the x, y and z components of 'in' into rows 0, 1 and 2 of the
 // requested column of 'out'. Other columns of 'out' are left untouched.
 inline void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
 {
 	const float flComponents[3] = { in.x, in.y, in.z };
 	for ( int iRow = 0; iRow < 3; ++iRow )
 	{
 		out[iRow][column] = flComponents[iRow];
 	}
 }
 inline void MatrixGetTranslation( const matrix3x4_t &in, Vector &out )
 {
@ -1079,7 +1048,19 @@ void VectorYawRotate( const Vector& in, float flYaw, Vector &out);
 // 0                   1
 //
 // With a biasAmt of 0.5, Bias returns X.
-float Bias( float x, float biasAmt );
+inline float Bias( float x, float biasAmt )
 {
 	// WARNING: not thread safe
 	// (lastAmt and lastExponent are function-local statics, i.e. shared by every caller.)
 	static float lastAmt = -1;
 	static float lastExponent = 0;
 	// NOTE(review): lastAmt is never assigned anywhere in this function, so it keeps
 	// its initial value of -1. The test below therefore passes for every biasAmt other
 	// than -1 and the exponent is recomputed on each call; for biasAmt == -1 the
 	// previously stored lastExponent is reused as-is.
 	if( lastAmt != biasAmt )
 	{
 		lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
 	}
 	// Result is x raised to the exponent held in lastExponent.
 	float fRet = pow( x, lastExponent );
 	Assert ( !IS_NAN( fRet ) );
 	return fRet;
 }
 // Gain is similar to Bias, but biasAmt biases towards or away from 0.5.
@ -1111,9 +1092,14 @@ float Bias( float x, float biasAmt );
 // |*****
 // |___________________
 // 0                   1
-float Gain( float x, float biasAmt );
+inline float Gain( float x, float biasAmt )
-
+{
-
+	// WARNING: not thread safe
 	if( x < 0.5 )
 		return 0.5f * Bias( 2*x, 1-biasAmt );
 	else
 		return 1 - 0.5f * Bias( 2 - 2*x, 1-biasAmt );
 }
 // SmoothCurve maps a 0-1 value into another 0-1 value based on a cosine wave
 // where the derivatives of the function at 0 and 1 (and 0.5) are 0. This is useful for
 // any fadein/fadeout effect where it should start and end smoothly.
--- a/public/mathlib/vector4d.h
+++ b/public/mathlib/vector4d.h
@ -35,7 +35,7 @@ class Vector2D;
 // 4D Vector4D
 //=========================================================
-class Vector4D					
+class alignas(16) Vector4D					
 {
 public:
 	// Members
--- a/public/mathlib/vmatrix.h
+++ b/public/mathlib/vmatrix.h
--- a/public/togl/linuxwin/dxabstract_types.h
+++ b/public/togl/linuxwin/dxabstract_types.h
@ -1042,7 +1042,7 @@ typedef enum _D3DSHADER_PARAM_REGISTER_TYPE
    D3DSPR_FORCE_DWORD  = 0x7fffffff,         // force 32-bit size enum
 } D3DSHADER_PARAM_REGISTER_TYPE;
-struct D3DMATRIX 
+struct alignas(16) D3DMATRIX 
 {
    union 
 	{
--- a/public/togles/linuxwin/dxabstract_types.h
+++ b/public/togles/linuxwin/dxabstract_types.h
@ -1042,7 +1042,7 @@ typedef enum _D3DSHADER_PARAM_REGISTER_TYPE
    D3DSPR_FORCE_DWORD  = 0x7fffffff,         // force 32-bit size enum
 } D3DSHADER_PARAM_REGISTER_TYPE;
-struct D3DMATRIX 
+struct alignas(16) D3DMATRIX 
 {
    union 
 	{