Merge pull request #248 from nillerusr/mathlib-optimize

Mathlib optimize
2 years ago · 697a9f34f9
11 changed files with 1522 additions and 1775 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -37,5 +37,3 @@ waf3*/
 .vscode/
 .depproj/
 source-engine.sln
-hl2/
-
--- a/game/server/hl2/ai_behavior_police.cpp
+++ b/game/server/hl2/ai_behavior_police.cpp
@@ -33,6 +33,7 @@ CAI_PolicingBehavior::CAI_PolicingBehavior( void )
 	m_bEnabled = false;
 	m_nNumWarnings = 0;
 	m_bTargetIsHostile = false;
+	m_hPoliceGoal = NULL;
 }

 //-----------------------------------------------------------------------------
--- a/materialsystem/imaterialsysteminternal.h
+++ b/materialsystem/imaterialsysteminternal.h
@@ -31,9 +31,9 @@ public:
 	{
 		MEM_ALLOC_CREDIT_( "CMatCallQueue.m_Allocator" );
 #ifdef SWDS
-		m_Allocator.Init( 2*1024, 0, 0, 4 );
+		m_Allocator.Init( 2*1024, 0, 0, 16 );
 #else
-		m_Allocator.Init( IsX360() ? 2*1024*1024 : 8*1024*1024, 64*1024, 256*1024, 4 );
+		m_Allocator.Init( IsX360() ? 2*1024*1024 : 8*1024*1024, 64*1024, 256*1024, 16 );
 #endif
 		m_FunctorFactory.SetAllocator( &m_Allocator );
 		m_pHead = m_pTail = NULL;
--- a/mathlib/mathlib_base.cpp
+++ b/mathlib/mathlib_base.cpp
@@ -420,13 +420,6 @@ void MatrixGetColumn( const matrix3x4_t& in, int column, Vector &out )
 	out.z = in[2][column];
 }

-void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
-{
-	out[0][column] = in.x;
-	out[1][column] = in.y;
-	out[2][column] = in.z;
-}
-
 void MatrixScaleBy ( const float flScale, matrix3x4_t &out )
 {
 	out[0][0] *= flScale;
@@ -1092,57 +1085,6 @@ void SetScaleMatrix( float x, float y, float z, matrix3x4_t &dst )
 	dst[2][0] = 0.0f;	dst[2][1] = 0.0f;	dst[2][2] = z;		dst[2][3] = 0.0f;
 }

-
-//-----------------------------------------------------------------------------
-// Purpose: Builds the matrix for a counterclockwise rotation about an arbitrary axis.
-//
-//		   | ax2 + (1 - ax2)cosQ		axay(1 - cosQ) - azsinQ		azax(1 - cosQ) + aysinQ |
-// Ra(Q) = | axay(1 - cosQ) + azsinQ	ay2 + (1 - ay2)cosQ			ayaz(1 - cosQ) - axsinQ |
-//		   | azax(1 - cosQ) - aysinQ	ayaz(1 - cosQ) + axsinQ		az2 + (1 - az2)cosQ     |
-//          
-// Input  : mat - 
-//			vAxisOrRot - 
-//			angle - 
-//-----------------------------------------------------------------------------
-void MatrixBuildRotationAboutAxis( const Vector &vAxisOfRot, float angleDegrees, matrix3x4_t &dst )
-{
-	float radians;
-	float axisXSquared;
-	float axisYSquared;
-	float axisZSquared;
-	float fSin;
-	float fCos;
-
-	radians = angleDegrees * ( M_PI / 180.0 );
-	fSin = sin( radians );
-	fCos = cos( radians );
-
-	axisXSquared = vAxisOfRot[0] * vAxisOfRot[0];
-	axisYSquared = vAxisOfRot[1] * vAxisOfRot[1];
-	axisZSquared = vAxisOfRot[2] * vAxisOfRot[2];
-
-	// Column 0:
-	dst[0][0] = axisXSquared + (1 - axisXSquared) * fCos;
-	dst[1][0] = vAxisOfRot[0] * vAxisOfRot[1] * (1 - fCos) + vAxisOfRot[2] * fSin;
-	dst[2][0] = vAxisOfRot[2] * vAxisOfRot[0] * (1 - fCos) - vAxisOfRot[1] * fSin;
-
-	// Column 1:
-	dst[0][1] = vAxisOfRot[0] * vAxisOfRot[1] * (1 - fCos) - vAxisOfRot[2] * fSin;
-	dst[1][1] = axisYSquared + (1 - axisYSquared) * fCos;
-	dst[2][1] = vAxisOfRot[1] * vAxisOfRot[2] * (1 - fCos) + vAxisOfRot[0] * fSin;
-
-	// Column 2:
-	dst[0][2] = vAxisOfRot[2] * vAxisOfRot[0] * (1 - fCos) + vAxisOfRot[1] * fSin;
-	dst[1][2] = vAxisOfRot[1] * vAxisOfRot[2] * (1 - fCos) - vAxisOfRot[0] * fSin;
-	dst[2][2] = axisZSquared + (1 - axisZSquared) * fCos;
-
-	// Column 3:
-	dst[0][3] = 0;
-	dst[1][3] = 0;
-	dst[2][3] = 0;
-}
-
-
 //-----------------------------------------------------------------------------
 // Computes the transpose
 //-----------------------------------------------------------------------------
@@ -1450,33 +1392,6 @@ void VectorYawRotate( const Vector &in, float flYaw, Vector &out)
 	out.z = in.z;
 }

-
-
-float Bias( float x, float biasAmt )
-{
-	// WARNING: not thread safe
-	static float lastAmt = -1;
-	static float lastExponent = 0;
-	if( lastAmt != biasAmt )
-	{
-		lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
-	}
-	float fRet = pow( x, lastExponent );
-	Assert ( !IS_NAN( fRet ) );
-	return fRet;
-}
-
-
-float Gain( float x, float biasAmt )
-{
-	// WARNING: not thread safe
-	if( x < 0.5 )
-		return 0.5f * Bias( 2*x, 1-biasAmt );
-	else
-		return 1 - 0.5f * Bias( 2 - 2*x, 1-biasAmt );
-}
-
-
 float SmoothCurve( float x )
 {
 	// Actual smooth curve. Visualization:
--- a/mathlib/vmatrix.cpp
+++ b/mathlib/vmatrix.cpp
--- a/public/mathlib/math_pfns.h
+++ b/public/mathlib/math_pfns.h
@@ -22,10 +22,16 @@ extern float (*pfFastCos)(float x);

 // The following are not declared as macros because they are often used in limiting situations,
 // and sometimes the compiler simply refuses to inline them for some reason
-#define FastSqrt(x)			(*pfSqrt)(x)
-#define	FastRSqrt(x)		(*pfRSqrt)(x)
-#define FastRSqrtFast(x)    (*pfRSqrtFast)(x)
+#define FastSqrt(x)			sqrtf(x)
+#define	FastRSqrt(x)		(1.f/sqrtf(x))
+#define FastRSqrtFast(x)    (1.f/sqrtf(x))
+
+#ifdef _WIN32
 #define FastSinCos(x,s,c)   (*pfFastSinCos)(x,s,c)
+#else
+#define FastSinCos(x,s,c)   sincosf(x,s,c)
+#endif
+
 #define FastCos(x)			(*pfFastCos)(x)

 #if defined(__i386__) || defined(_M_IX86)
--- a/public/mathlib/mathlib.h
+++ b/public/mathlib/mathlib.h
@@ -30,7 +30,6 @@
 // FP exception clean so this not a turnkey operation.
 //#define FP_EXCEPTIONS_ENABLED

-
 #ifdef FP_EXCEPTIONS_ENABLED
 #include <float.h> // For _clearfp and _controlfp_s
 #endif
@@ -93,37 +92,11 @@ private:
 	FPExceptionEnabler& operator=(const FPExceptionEnabler&);
 };

-
-
-#ifdef DEBUG  // stop crashing edit-and-continue
-FORCEINLINE float clamp( float val, float minVal, float maxVal )
-{
-	if ( maxVal < minVal )
-		return maxVal;
-	else if( val < minVal )
-		return minVal;
-	else if( val > maxVal )
-		return maxVal;
-	else
-		return val;
-}
-#else // DEBUG
-FORCEINLINE float clamp( float val, float minVal, float maxVal )
+inline float clamp( const float val, const float minVal, const float maxVal )
 {
-#if defined(__i386__) || defined(_M_IX86)
-	_mm_store_ss( &val,
-		_mm_min_ss(
-			_mm_max_ss(
-				_mm_load_ss(&val),
-				_mm_load_ss(&minVal) ),
-			_mm_load_ss(&maxVal) ) );
-#else
-	val = fpmax(minVal, val);
-	val = fpmin(maxVal, val);
-#endif
-	return val;
+	const float t = val < minVal ? minVal : val;
+	return t > maxVal ? maxVal : t;
 }
-#endif // DEBUG

 //
 // Returns a clamped value in the range [min, max].
@@ -131,17 +104,10 @@ FORCEINLINE float clamp( float val, float minVal, float maxVal )
 template< class T >
 inline T clamp( T const &val, T const &minVal, T const &maxVal )
 {
-	if ( maxVal < minVal )
-		return maxVal;
-	else if( val < minVal )
-		return minVal;
-	else if( val > maxVal )
-		return maxVal;
-	else
-		return val;
+	const T t = val< minVal ? minVal : val;
+	return t > maxVal ? maxVal : t;
 }

-
 // plane_t structure
 // !!! if this is changed, it must be changed in asm code too !!!
 // FIXME: does the asm code even exist anymore?
@@ -237,8 +203,8 @@ bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t

 struct matrix3x4_t
 {
-	matrix3x4_t() = default;
-	matrix3x4_t( 
+	inline matrix3x4_t() = default;
+	inline matrix3x4_t(
 		float m00, float m01, float m02, float m03,
 		float m10, float m11, float m12, float m13,
 		float m20, float m21, float m22, float m23 )
@@ -252,7 +218,7 @@ struct matrix3x4_t
 	// Creates a matrix where the X axis = forward
 	// the Y axis = left, and the Z axis = up
 	//-----------------------------------------------------------------------------
-	void Init( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
+	inline void Init( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
 	{
 		m_flMatVal[0][0] = xAxis.x; m_flMatVal[0][1] = yAxis.x; m_flMatVal[0][2] = zAxis.x; m_flMatVal[0][3] = vecOrigin.x;
 		m_flMatVal[1][0] = xAxis.y; m_flMatVal[1][1] = yAxis.y; m_flMatVal[1][2] = zAxis.y; m_flMatVal[1][3] = vecOrigin.y;
@@ -263,26 +229,23 @@ struct matrix3x4_t
 	// Creates a matrix where the X axis = forward
 	// the Y axis = left, and the Z axis = up
 	//-----------------------------------------------------------------------------
-	matrix3x4_t( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
+	inline matrix3x4_t( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
 	{
 		Init( xAxis, yAxis, zAxis, vecOrigin );
 	}

 	inline void Invalidate( void )
 	{
-		for (int i = 0; i < 3; i++)
+		for( int i=0; i < 12; i++ )
 		{
-			for (int j = 0; j < 4; j++)
-			{
-				m_flMatVal[i][j] = VEC_T_NAN;
-			}
+			((float*)m_flMatVal)[i] = VEC_T_NAN;
 		}
 	}

-	float *operator[]( int i )				{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
-	const float *operator[]( int i ) const	{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
-	float *Base()							{ return &m_flMatVal[0][0]; }
-	const float *Base() const				{ return &m_flMatVal[0][0]; }
+	inline float *operator[]( int i )				{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
+	inline const float *operator[]( int i ) const	{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
+	inline float *Base()							{ return &m_flMatVal[0][0]; }
+	inline const float *Base() const				{ return &m_flMatVal[0][0]; }

 	float m_flMatVal[3][4];
 };
@@ -565,7 +528,13 @@ void MatrixInvert( const matrix3x4_t &in, matrix3x4_t &out );
 bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float flTolerance = 1e-5 );

 void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out );
-void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out );
+
+inline void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
+{
+	out[0][column] = in.x;
+	out[1][column] = in.y;
+	out[2][column] = in.z;
+}

 inline void MatrixGetTranslation( const matrix3x4_t &in, Vector &out )
 {
@@ -1079,7 +1048,19 @@ void VectorYawRotate( const Vector& in, float flYaw, Vector &out);
 // 0                   1
 //
 // With a biasAmt of 0.5, Bias returns X.
-float Bias( float x, float biasAmt );
+inline float Bias( float x, float biasAmt )
+{
+	// WARNING: not thread safe
+	static float lastAmt = -1;
+	static float lastExponent = 0;
+	if( lastAmt != biasAmt )
+	{
+		lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
+	}
+	float fRet = pow( x, lastExponent );
+	Assert ( !IS_NAN( fRet ) );
+	return fRet;
+}


 // Gain is similar to Bias, but biasAmt biases towards or away from 0.5.
@@ -1111,9 +1092,14 @@ float Bias( float x, float biasAmt );
 // |*****
 // |___________________
 // 0                   1
-float Gain( float x, float biasAmt );
-
-
+inline float Gain( float x, float biasAmt )
+{
+	// WARNING: not thread safe
+	if( x < 0.5 )
+		return 0.5f * Bias( 2*x, 1-biasAmt );
+	else
+		return 1 - 0.5f * Bias( 2 - 2*x, 1-biasAmt );
+}
 // SmoothCurve maps a 0-1 value into another 0-1 value based on a cosine wave
 // where the derivatives of the function at 0 and 1 (and 0.5) are 0. This is useful for
 // any fadein/fadeout effect where it should start and end smoothly.
--- a/public/mathlib/vector4d.h
+++ b/public/mathlib/vector4d.h
@@ -35,7 +35,7 @@ class Vector2D;
 // 4D Vector4D
 //=========================================================

-class Vector4D					
+class alignas(16) Vector4D					
 {
 public:
 	// Members
--- a/public/mathlib/vmatrix.h
+++ b/public/mathlib/vmatrix.h
--- a/public/togl/linuxwin/dxabstract_types.h
+++ b/public/togl/linuxwin/dxabstract_types.h
@@ -1042,7 +1042,7 @@ typedef enum _D3DSHADER_PARAM_REGISTER_TYPE
    D3DSPR_FORCE_DWORD  = 0x7fffffff,         // force 32-bit size enum
 } D3DSHADER_PARAM_REGISTER_TYPE;

-struct D3DMATRIX 
+struct alignas(16) D3DMATRIX 
 {
    union 
 	{
--- a/public/togles/linuxwin/dxabstract_types.h
+++ b/public/togles/linuxwin/dxabstract_types.h
@@ -1042,7 +1042,7 @@ typedef enum _D3DSHADER_PARAM_REGISTER_TYPE
    D3DSPR_FORCE_DWORD  = 0x7fffffff,         // force 32-bit size enum
 } D3DSHADER_PARAM_REGISTER_TYPE;

-struct D3DMATRIX 
+struct alignas(16) D3DMATRIX 
 {
    union 
 	{