materialsystem: threaded optimizations, fix mat_queue_mode on some Android devices

2 years ago · a15db00bc0
8 changed files with 66 additions and 140 deletions
--- a/materialsystem/cmaterialsystem.cpp
+++ b/materialsystem/cmaterialsystem.cpp
@ -1029,7 +1029,7 @@ bool CMaterialSystem::AllowThreading( bool bAllow, int nServiceThread )
				@@ -1029,7 +1029,7 @@ bool CMaterialSystem::AllowThreading( bool bAllow, int nServiceThread )

 	bool bOldAllow = m_bAllowQueuedRendering;

-	if ( GetCPUInformation()->m_nPhysicalProcessors >= 2 )
+	if ( GetCPUInformation()->m_nLogicalProcessors >= 2 )
 	{
 		m_bAllowQueuedRendering = bAllow;
 		bool bQueued = m_IdealThreadMode != MATERIAL_SINGLE_THREADED;
@ -1806,11 +1806,7 @@ static ConVar mat_normalmaps(		"mat_normalmaps", "0", FCVAR_CHEAT );
				@@ -1806,11 +1806,7 @@ static ConVar mat_normalmaps(		"mat_normalmaps", "0", FCVAR_CHEAT );
 static ConVar mat_measurefillrate(	"mat_measurefillrate", "0", FCVAR_CHEAT );
 static ConVar mat_fillrate(			"mat_fillrate", "0", FCVAR_CHEAT );
 static ConVar mat_reversedepth(		"mat_reversedepth", "0", FCVAR_CHEAT );
-#ifdef DX_TO_GL_ABSTRACTION
-static ConVar mat_bufferprimitives( "mat_bufferprimitives", "0" );	// I'm not seeing any benefit speed wise for buffered primitives on GLM/POSIX (checked via TF2 timedemo) - default to zero
-#else
 static ConVar mat_bufferprimitives( "mat_bufferprimitives", "1" );
-#endif
 static ConVar mat_drawflat(			"mat_drawflat","0", FCVAR_CHEAT );
 static ConVar mat_softwarelighting( "mat_softwarelighting", "0", FCVAR_ALLOWED_IN_COMPETITIVE );
 static ConVar mat_proxy(			"mat_proxy", "0", FCVAR_CHEAT, "", MatProxyCallback );
@ -2780,8 +2776,8 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
				@@ -2780,8 +2776,8 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
 {
 	// We need lower-case symbols for this to work
 	int nLen = Q_strlen( pMaterialName ) + 1;
-	char *pFixedNameTemp = (char*)malloc( nLen );
-	char *pTemp = (char*)malloc( nLen );
+	char *pFixedNameTemp = (char*)stackalloc( nLen );
+	char *pTemp = (char*)stackalloc( nLen );
 	Q_strncpy( pFixedNameTemp, pMaterialName, nLen );
 	Q_strlower( pFixedNameTemp );
 #ifdef POSIX
@ -2883,9 +2879,6 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
				@@ -2883,9 +2879,6 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
 		}
 	}

-	free(pTemp);
-	free(pFixedNameTemp);
-
 	return g_pErrorMaterial->GetRealTimeVersion();
 }

@ -3103,20 +3096,12 @@ void CMaterialSystem::ResetTempHWMemory( bool bExitingLevel )
				@@ -3103,20 +3096,12 @@ void CMaterialSystem::ResetTempHWMemory( bool bExitingLevel )
 //-----------------------------------------------------------------------------
 void CMaterialSystem::CacheUsedMaterials( )
 {
+	printf("Cache materials\n");
+
 	g_pShaderAPI->EvictManagedResources();
-	size_t count = 0;
+
 	for (MaterialHandle_t i = FirstMaterial(); i != InvalidMaterial(); i = NextMaterial(i) )
 	{
-		// Some (mac) drivers (amd) seem to keep extra resources around on uploads until the next frame swap.  This
-		// injects pointless synthetic swaps (between already-static load frames)
-		if ( mat_texture_reload_frame_swap_workaround.GetBool() )
-		{
-			if ( count++ % 20 == 0 )
-			{
-				Flush(true);
-				SwapBuffers(); // Not the right thing to call
-			}
-		}
 		IMaterialInternal* pMat = GetMaterialInternal(i);
 		Assert( pMat->GetReferenceCount() >= 0 );
 		if( pMat->GetReferenceCount() > 0 )
@ -3703,9 +3688,13 @@ void CMaterialSystem::EndFrame( void )
				@@ -3703,9 +3688,13 @@ void CMaterialSystem::EndFrame( void )
 				ThreadAcquire( true );
 			}

+			IThreadPool* pThreadPool = CreateMatQueueThreadPool();
+
 			if ( m_pActiveAsyncJob && !m_pActiveAsyncJob->IsFinished() )
 			{
-				m_pActiveAsyncJob->WaitForFinish();
+				m_pActiveAsyncJob->WaitForFinish(TT_INFINITE, pThreadPool);
+
+				// Sync with GPU if we had a job for it, even if it finished early on CPU!
 				if ( !IsPC() && g_config.ForceHWSync() )
 				{
 					g_pShaderAPI->ForceHardwareSync();
@ -3730,7 +3719,6 @@ void CMaterialSystem::EndFrame( void )
				@@ -3730,7 +3719,6 @@ void CMaterialSystem::EndFrame( void )
 				}
 			}

-			IThreadPool *pThreadPool = CreateMatQueueThreadPool();
 			pThreadPool->AddJob( m_pActiveAsyncJob );
 			break;
 		}
@ -4664,20 +4652,9 @@ void CMaterialSystem::BeginRenderTargetAllocation( void )
				@@ -4664,20 +4652,9 @@ void CMaterialSystem::BeginRenderTargetAllocation( void )

 void CMaterialSystem::EndRenderTargetAllocation( void )
 {
-	// Any GPU newer than 2005 doesn't need to do this, and it eats up ~40% of our level load time! 
-	const bool cbRequiresRenderTargetAllocationFirst = mat_requires_rt_alloc_first.GetBool();
-
 	g_pShaderAPI->FlushBufferedPrimitives();
 	m_bAllocatingRenderTargets = false;

-	if ( IsPC() && cbRequiresRenderTargetAllocationFirst && g_pShaderAPI->CanDownloadTextures() )
-	{
-		// Simulate an Alt-Tab...will cause RTs to be allocated first
-
-		g_pShaderDevice->ReleaseResources();
-		g_pShaderDevice->ReacquireResources();
-	}
-
 	TextureManager()->CacheExternalStandardRenderTargets();
 }

--- a/materialsystem/cmatqueuedrendercontext.cpp
+++ b/materialsystem/cmatqueuedrendercontext.cpp
@ -455,14 +455,11 @@ public:
				@@ -455,14 +455,11 @@ public:
 			}
 			else
 			{
-				ALIGN16 uint16 tempIndices[16];
+				static ALIGN16 uint16 tempIndices[256];

+				// original method
 				int i = 0;
-				if ( (size_t)desc.m_pIndices % 4 == 2 )
-				{
-					desc.m_pIndices[i] = pIndexData[i] + desc.m_nFirstVertex;
-					i++;
-				}
+
 				while ( i < nIndices )
 				{
 					int nToCopy = min( (int)ARRAYSIZE(tempIndices), nIndices - i );
--- a/materialsystem/ctexture.cpp
+++ b/materialsystem/ctexture.cpp
@ -2458,13 +2458,6 @@ bool CTexture::AsyncReadTextureFromFile( IVTFTexture* pVTFTexture, unsigned int
				@@ -2458,13 +2458,6 @@ bool CTexture::AsyncReadTextureFromFile( IVTFTexture* pVTFTexture, unsigned int
 		return false;
 	}

-	if ( V_strstr( GetName(), "c_sniperrifle_scope" ) )
-	{
-		int i = 0;
-		i = 3;
-	}
-
-
 	tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s - %s", __FUNCTION__, tmDynamicString( TELEMETRY_LEVEL0, pCacheFileName ) );

 	// OSX hackery
@ -4189,12 +4182,6 @@ bool SLoadTextureBitsFromFile( IVTFTexture **ppOutVtfTexture, FileHandle_t hFile
				@@ -4189,12 +4182,6 @@ bool SLoadTextureBitsFromFile( IVTFTexture **ppOutVtfTexture, FileHandle_t hFile
 	// NOTE! NOTE! NOTE! or by the streaming texture code!
 	Assert( ppOutVtfTexture != NULL && *ppOutVtfTexture != NULL );

-	if ( V_strstr( pName, "c_rocketlauncher/c_rocketlauncher" ) )
-	{
-		int i = 0;
-		i = 3;
-	}
-
 	CUtlBuffer buf;

 	{
--- a/public/tier0/threadtools.h
+++ b/public/tier0/threadtools.h
@ -52,6 +52,12 @@
				@@ -52,6 +52,12 @@
 #pragma once
 #pragma warning(push)
 #pragma warning(disable:4251)
+
+extern "C"  // C-linkage forward declaration of Sleep(), which the inline ThreadSleep() added later in this header calls
+{
+	void __declspec(dllimport) __stdcall Sleep( unsigned long );  // NOTE(review): presumably declared by hand to avoid including the Windows headers here -- confirm it matches the SDK prototype
+}
+
 #endif

 #ifdef COMPILER_MSVC64
@ -194,8 +200,6 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t );
				@@ -194,8 +200,6 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t );

 //-----------------------------------------------------------------------------

-PLATFORM_INTERFACE void ThreadSleep(unsigned duration = 0);
-PLATFORM_INTERFACE void ThreadNanoSleep(unsigned ns);
 PLATFORM_INTERFACE ThreadId_t ThreadGetCurrentId();
 PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle();
 PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL );
@ -229,10 +233,10 @@ inline void ThreadPause()
				@@ -229,10 +233,10 @@ inline void ThreadPause()
 {
 #if defined( COMPILER_PS3 )
 	__db16cyc();
-#elif defined(__arm__) || defined(__aarch64__)
-        sched_yield();
-#elif defined( COMPILER_GCC )
+#elif defined( COMPILER_GCC ) && (defined( __i386__ ) || defined( __x86_64__ ))
 	__asm __volatile( "pause" );
+#elif defined( POSIX )
+        sched_yield();
 #elif defined ( COMPILER_MSVC64 )
 	_mm_pause();
 #elif defined( COMPILER_MSVC32 )
@ -247,6 +251,36 @@ inline void ThreadPause()
				@@ -247,6 +251,36 @@ inline void ThreadPause()
 #endif
 }

+inline void ThreadSleep(unsigned nMilliseconds = 0)  // Sleeps for nMilliseconds via the platform call (Sleep / sys_timer_usleep / usleep); a zero duration only issues ThreadPause().
+{
+	if( nMilliseconds == 0 )
+	{
+		ThreadPause();  // NOTE(review): the removed tier0 implementation passed 0 through to Sleep() / sys_ppu_thread_yield() / usleep() instead -- confirm callers that use ThreadSleep(0) as a yield are fine with this
+		return;
+        }
+
+#ifdef _WIN32
+
+#ifdef _WIN32_PC
+        static bool bInitialized = false;  // NOTE(review): plain static flag with no synchronization visible here -- confirm concurrent first calls are acceptable
+        if ( !bInitialized )
+        {
+                bInitialized = true;
+                // Set the timer resolution to 1 ms (default is 10.0, 15.6, 2.5, 1.0 or
+                // some other value depending on hardware and software) so that we can
+                // use Sleep( 1 ) to avoid wasting CPU time without missing our frame
+                // rate.
+                timeBeginPeriod( 1 );  // NOTE(review): only Sleep() is forward-declared in the visible part of this header -- confirm timeBeginPeriod is declared wherever this inline function is compiled
+        }
+#endif
+	Sleep( nMilliseconds );
+#elif PS3
+	sys_timer_usleep( nMilliseconds * 1000 );  // NOTE(review): the unsigned multiply wraps for durations above UINT_MAX / 1000 ms (about 71.5 minutes with 32-bit unsigned)
+#elif defined(POSIX)
+        usleep( nMilliseconds * 1000 );  // NOTE(review): same wrap-around as the PS3 branch; some usleep() implementations may also reject arguments of 1,000,000 us or more -- confirm for the target platforms
+#endif
+}
+
 PLATFORM_INTERFACE bool ThreadJoin( ThreadHandle_t, unsigned timeout = TT_INFINITE );

 PLATFORM_INTERFACE void ThreadSetDebugName( ThreadHandle_t hThread, const char *pszName );
--- a/public/tier1/memhelpers.h
+++ b/public/tier1/memhelpers.h
@ -11,21 +11,15 @@ namespace memutils
				@@ -11,21 +11,15 @@ namespace memutils
 	template<typename T>
 	inline void copy( T *dest, const T *src, size_t n )
 	{
-		do
-		{
-			--n;
-			*(dest+n) = *(src+n);
-	        } while( n );
+		for(; n; n--)  // assigns n elements from src to dest, front to back; unlike the removed do/while (which decremented before testing), n == 0 performs no assignment
+			*(dest++) = *(src++);
 	}

 	template<typename T>
-	inline void set( T *dest, T value, size_t n )
+	inline void set( T *dest, const T& value, size_t n )  // assigns value to n consecutive elements starting at dest; value is now taken by const reference instead of by copy
 	{
-		do
-		{
-			--n;
-			*(dest+n) = value;
-		} while( n );
+		for(; n; n--)  // tests n before each assignment, so n == 0 writes nothing (the removed do/while decremented first)
+			*(dest++) = value;
 	}
 }

--- a/public/vstdlib/jobthread.h
+++ b/public/vstdlib/jobthread.h
@ -492,8 +492,8 @@ public:
				@@ -492,8 +492,8 @@ public:
 	//-----------------------------------------------------
 	// Thread event support (safe for NULL this to simplify code )
 	//-----------------------------------------------------
-	bool WaitForFinish( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; return ( !IsFinished() ) ? g_pThreadPool->YieldWait( this, dwTimeout ) : true; }
-	bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }
+	inline bool WaitForFinish( uint32 dwTimeout = TT_INFINITE, IThreadPool *pool = g_pThreadPool ) { if (!this) return true; return ( !IsFinished() ) ? pool->YieldWait( this, dwTimeout ) : true; }  // Returns true if already finished, otherwise the result of pool->YieldWait(); NOTE(review): the "if (!this)" guard is undefined behaviour in standard C++ and optimizing compilers may remove it
+	inline bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }  // Waits, then calls Release() and returns the wait result; NOTE(review): takes no pool argument, so the wait always uses WaitForFinish()'s default pool (g_pThreadPool)
 	CThreadEvent *AccessEvent()						{ return &m_CompleteEvent; }

 	//-----------------------------------------------------
--- a/tier0/threadtools.cpp
+++ b/tier0/threadtools.cpp
@ -485,59 +485,6 @@ bool ReleaseThreadHandle( ThreadHandle_t hThread )
				@@ -485,59 +485,6 @@ bool ReleaseThreadHandle( ThreadHandle_t hThread )
 //
 //-----------------------------------------------------------------------------

-void ThreadSleep(unsigned nMilliseconds)
-{
-#ifdef _WIN32
-
-#ifdef _WIN32_PC
-	static bool bInitialized = false;
-	if ( !bInitialized )
-	{
-		bInitialized = true;
-		// Set the timer resolution to 1 ms (default is 10.0, 15.6, 2.5, 1.0 or
-		// some other value depending on hardware and software) so that we can
-		// use Sleep( 1 ) to avoid wasting CPU time without missing our frame
-		// rate.
-		timeBeginPeriod( 1 );
-	}
-#endif
-
-	Sleep( nMilliseconds );
-#elif PS3
-	if( nMilliseconds == 0 )
-	{
-		// sys_ppu_thread_yield doesn't seem to function properly, so sleep instead.
-//		sys_timer_usleep( 60 );
-		sys_ppu_thread_yield();
-	}
-	else
-	{
-		sys_timer_usleep( nMilliseconds * 1000 );
-	}
-#elif defined(POSIX)
-   usleep( nMilliseconds * 1000 ); 
-#endif
-}
-
-//-----------------------------------------------------------------------------
-void ThreadNanoSleep(unsigned ns)
-{
-#ifdef _WIN32
-	// ceil
-	Sleep( ( ns + 999 ) / 1000 );
-#elif PS3
-	sys_timer_usleep( ns );
-#elif defined(POSIX)
-	struct timespec tm;
-	tm.tv_sec = 0;
-	tm.tv_nsec = ns;
-	nanosleep( &tm, NULL ); 
-#endif
-}
-
-
-//-----------------------------------------------------------------------------
-
 #ifndef ThreadGetCurrentId
 ThreadId_t ThreadGetCurrentId()
 {
--- a/vstdlib/jobthread.cpp
+++ b/vstdlib/jobthread.cpp
@ -214,7 +214,11 @@ public:
				@@ -214,7 +214,11 @@ public:
 	//-----------------------------------------------------
 	virtual int YieldWait( CThreadEvent **pEvents, int nEvents, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
 	virtual int YieldWait( CJob **, int nJobs, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
-	void Yield( unsigned timeout );
+	inline void Yield( unsigned timeout )  // Asserts the caller is on the main thread, then sleeps via ThreadSleep( timeout ); timeout is passed as ThreadSleep's millisecond argument
+	{
+		Assert( ThreadInMainThread() );
+		ThreadSleep( timeout );  // the removed out-of-line version called ThreadSleep( timeout ) on both of its branches as well
+	}

 	//-----------------------------------------------------
 	// Add a native job to the queue (master thread)
@ -656,20 +660,6 @@ int CThreadPool::YieldWait( CJob **ppJobs, int nJobs, bool bWaitAll, unsigned ti
				@@ -656,20 +660,6 @@ int CThreadPool::YieldWait( CJob **ppJobs, int nJobs, bool bWaitAll, unsigned ti
 	return YieldWait( handles.Base(), handles.Count(), bWaitAll, timeout);
 }

-//---------------------------------------------------------
-
-void CThreadPool::Yield( unsigned timeout )
-{
-	// @MULTICORE (toml 10/24/2006): not implemented
-	Assert( ThreadInMainThread() );
-	if ( !ThreadInMainThread() )
-	{
-		ThreadSleep( timeout );
-		return;
-	}
-	ThreadSleep( timeout );
-}
-
 //---------------------------------------------------------
 // Add a job to the queue
 //---------------------------------------------------------