Browse Source

materialsystem: threaded optimizations, fix mat_queue_mode on some android devices

pull/174/head
nillerusr 1 year ago
parent
commit
a15db00bc0
  1. 45
      materialsystem/cmaterialsystem.cpp
  2. 9
      materialsystem/cmatqueuedrendercontext.cpp
  3. 15
      materialsystem/ctexture.cpp
  4. 44
      public/tier0/threadtools.h
  5. 16
      public/tier1/memhelpers.h
  6. 4
      public/vstdlib/jobthread.h
  7. 53
      tier0/threadtools.cpp
  8. 20
      vstdlib/jobthread.cpp

45
materialsystem/cmaterialsystem.cpp

@ -1029,7 +1029,7 @@ bool CMaterialSystem::AllowThreading( bool bAllow, int nServiceThread ) @@ -1029,7 +1029,7 @@ bool CMaterialSystem::AllowThreading( bool bAllow, int nServiceThread )
bool bOldAllow = m_bAllowQueuedRendering;
if ( GetCPUInformation()->m_nPhysicalProcessors >= 2 )
if ( GetCPUInformation()->m_nLogicalProcessors >= 2 )
{
m_bAllowQueuedRendering = bAllow;
bool bQueued = m_IdealThreadMode != MATERIAL_SINGLE_THREADED;
@ -1806,11 +1806,7 @@ static ConVar mat_normalmaps( "mat_normalmaps", "0", FCVAR_CHEAT ); @@ -1806,11 +1806,7 @@ static ConVar mat_normalmaps( "mat_normalmaps", "0", FCVAR_CHEAT );
static ConVar mat_measurefillrate( "mat_measurefillrate", "0", FCVAR_CHEAT );
static ConVar mat_fillrate( "mat_fillrate", "0", FCVAR_CHEAT );
static ConVar mat_reversedepth( "mat_reversedepth", "0", FCVAR_CHEAT );
#ifdef DX_TO_GL_ABSTRACTION
static ConVar mat_bufferprimitives( "mat_bufferprimitives", "0" ); // I'm not seeing any benefit speed wise for buffered primitives on GLM/POSIX (checked via TF2 timedemo) - default to zero
#else
static ConVar mat_bufferprimitives( "mat_bufferprimitives", "1" );
#endif
static ConVar mat_drawflat( "mat_drawflat","0", FCVAR_CHEAT );
static ConVar mat_softwarelighting( "mat_softwarelighting", "0", FCVAR_ALLOWED_IN_COMPETITIVE );
static ConVar mat_proxy( "mat_proxy", "0", FCVAR_CHEAT, "", MatProxyCallback );
@ -2780,8 +2776,8 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha @@ -2780,8 +2776,8 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
{
// We need lower-case symbols for this to work
int nLen = Q_strlen( pMaterialName ) + 1;
char *pFixedNameTemp = (char*)malloc( nLen );
char *pTemp = (char*)malloc( nLen );
char *pFixedNameTemp = (char*)stackalloc( nLen );
char *pTemp = (char*)stackalloc( nLen );
Q_strncpy( pFixedNameTemp, pMaterialName, nLen );
Q_strlower( pFixedNameTemp );
#ifdef POSIX
@ -2883,9 +2879,6 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha @@ -2883,9 +2879,6 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
}
}
free(pTemp);
free(pFixedNameTemp);
return g_pErrorMaterial->GetRealTimeVersion();
}
@ -3103,20 +3096,12 @@ void CMaterialSystem::ResetTempHWMemory( bool bExitingLevel ) @@ -3103,20 +3096,12 @@ void CMaterialSystem::ResetTempHWMemory( bool bExitingLevel )
//-----------------------------------------------------------------------------
void CMaterialSystem::CacheUsedMaterials( )
{
printf("Cache materials\n");
g_pShaderAPI->EvictManagedResources();
size_t count = 0;
for (MaterialHandle_t i = FirstMaterial(); i != InvalidMaterial(); i = NextMaterial(i) )
{
// Some (mac) drivers (amd) seem to keep extra resources around on uploads until the next frame swap. This
// injects pointless synthetic swaps (between already-static load frames)
if ( mat_texture_reload_frame_swap_workaround.GetBool() )
{
if ( count++ % 20 == 0 )
{
Flush(true);
SwapBuffers(); // Not the right thing to call
}
}
IMaterialInternal* pMat = GetMaterialInternal(i);
Assert( pMat->GetReferenceCount() >= 0 );
if( pMat->GetReferenceCount() > 0 )
@ -3703,9 +3688,13 @@ void CMaterialSystem::EndFrame( void ) @@ -3703,9 +3688,13 @@ void CMaterialSystem::EndFrame( void )
ThreadAcquire( true );
}
IThreadPool* pThreadPool = CreateMatQueueThreadPool();
if ( m_pActiveAsyncJob && !m_pActiveAsyncJob->IsFinished() )
{
m_pActiveAsyncJob->WaitForFinish();
m_pActiveAsyncJob->WaitForFinish(TT_INFINITE, pThreadPool);
// Sync with GPU if we had a job for it, even if it finished early on CPU!
if ( !IsPC() && g_config.ForceHWSync() )
{
g_pShaderAPI->ForceHardwareSync();
@ -3730,7 +3719,6 @@ void CMaterialSystem::EndFrame( void ) @@ -3730,7 +3719,6 @@ void CMaterialSystem::EndFrame( void )
}
}
IThreadPool *pThreadPool = CreateMatQueueThreadPool();
pThreadPool->AddJob( m_pActiveAsyncJob );
break;
}
@ -4664,20 +4652,9 @@ void CMaterialSystem::BeginRenderTargetAllocation( void ) @@ -4664,20 +4652,9 @@ void CMaterialSystem::BeginRenderTargetAllocation( void )
// Closes the render-target allocation phase: flushes buffered primitives,
// clears the "allocating render targets" flag, optionally forces a device
// release/reacquire cycle, and finally caches the external standard render
// targets through the texture manager.
void CMaterialSystem::EndRenderTargetAllocation( void )
{
	// Read the convar first, before any shader API work is issued.
	// Any GPU newer than 2005 doesn't need to do this, and it eats up ~40% of our level load time!
	const bool bAllocRenderTargetsFirst = mat_requires_rt_alloc_first.GetBool();

	g_pShaderAPI->FlushBufferedPrimitives();
	m_bAllocatingRenderTargets = false;

	if ( IsPC() && bAllocRenderTargetsFirst )
	{
		// Only queried when the two cheaper checks above have already passed.
		if ( g_pShaderAPI->CanDownloadTextures() )
		{
			// Simulate an Alt-Tab...will cause RTs to be allocated first
			g_pShaderDevice->ReleaseResources();
			g_pShaderDevice->ReacquireResources();
		}
	}

	TextureManager()->CacheExternalStandardRenderTargets();
}

9
materialsystem/cmatqueuedrendercontext.cpp

@ -455,14 +455,11 @@ public: @@ -455,14 +455,11 @@ public:
}
else
{
ALIGN16 uint16 tempIndices[16];
static ALIGN16 uint16 tempIndices[256];
// original method
int i = 0;
if ( (size_t)desc.m_pIndices % 4 == 2 )
{
desc.m_pIndices[i] = pIndexData[i] + desc.m_nFirstVertex;
i++;
}
while ( i < nIndices )
{
int nToCopy = min( (int)ARRAYSIZE(tempIndices), nIndices - i );

15
materialsystem/ctexture.cpp

@ -2458,15 +2458,8 @@ bool CTexture::AsyncReadTextureFromFile( IVTFTexture* pVTFTexture, unsigned int @@ -2458,15 +2458,8 @@ bool CTexture::AsyncReadTextureFromFile( IVTFTexture* pVTFTexture, unsigned int
return false;
}
if ( V_strstr( GetName(), "c_sniperrifle_scope" ) )
{
int i = 0;
i = 3;
}
tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s - %s", __FUNCTION__, tmDynamicString( TELEMETRY_LEVEL0, pCacheFileName ) );
// OSX hackery
int nPreserveFlags = nAdditionalCreationFlags;
if ( m_nFlags & TEXTUREFLAGS_SRGB )
@ -4189,12 +4182,6 @@ bool SLoadTextureBitsFromFile( IVTFTexture **ppOutVtfTexture, FileHandle_t hFile @@ -4189,12 +4182,6 @@ bool SLoadTextureBitsFromFile( IVTFTexture **ppOutVtfTexture, FileHandle_t hFile
// NOTE! NOTE! NOTE! or by the streaming texture code!
Assert( ppOutVtfTexture != NULL && *ppOutVtfTexture != NULL );
if ( V_strstr( pName, "c_rocketlauncher/c_rocketlauncher" ) )
{
int i = 0;
i = 3;
}
CUtlBuffer buf;
{

44
public/tier0/threadtools.h

@ -52,6 +52,12 @@ @@ -52,6 +52,12 @@
#pragma once
#pragma warning(push)
#pragma warning(disable:4251)
extern "C"
{
void __declspec(dllimport) __stdcall Sleep( unsigned long );
}
#endif
#ifdef COMPILER_MSVC64
@ -194,8 +200,6 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t ); @@ -194,8 +200,6 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t );
//-----------------------------------------------------------------------------
PLATFORM_INTERFACE void ThreadSleep(unsigned duration = 0);
PLATFORM_INTERFACE void ThreadNanoSleep(unsigned ns);
PLATFORM_INTERFACE ThreadId_t ThreadGetCurrentId();
PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle();
PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL );
@ -229,10 +233,10 @@ inline void ThreadPause() @@ -229,10 +233,10 @@ inline void ThreadPause()
{
#if defined( COMPILER_PS3 )
__db16cyc();
#elif defined(__arm__) || defined(__aarch64__)
sched_yield();
#elif defined( COMPILER_GCC )
#elif defined( COMPILER_GCC ) && (defined( __i386__ ) || defined( __x86_64__ ))
__asm __volatile( "pause" );
#elif defined( POSIX )
sched_yield();
#elif defined ( COMPILER_MSVC64 )
_mm_pause();
#elif defined( COMPILER_MSVC32 )
@ -247,6 +251,36 @@ inline void ThreadPause() @@ -247,6 +251,36 @@ inline void ThreadPause()
#endif
}
// Suspends the calling thread for nMilliseconds milliseconds.
// A value of 0 never reaches the OS sleep call: it issues one ThreadPause()
// and returns.
// NOTE(review): callers that relied on a zero-length sleep handing the time
// slice back to the scheduler now get whatever ThreadPause() does on the
// target - confirm that is intended.
// NOTE(review): on POSIX the wait goes through usleep( ms * 1000 ); POSIX only
// specifies usleep() for intervals below one second and the multiplication is
// done in unsigned arithmetic - confirm no caller passes very large waits.
inline void ThreadSleep(unsigned nMilliseconds = 0)
{
	if( nMilliseconds != 0 )
	{
#ifdef _WIN32
#ifdef _WIN32_PC
		static bool s_bTimerPeriodSet = false;
		if ( s_bTimerPeriodSet == false )
		{
			s_bTimerPeriodSet = true;
			// Set the timer resolution to 1 ms (default is 10.0, 15.6, 2.5, 1.0 or
			// some other value depending on hardware and software) so that we can
			// use Sleep( 1 ) to avoid wasting CPU time without missing our frame
			// rate.
			timeBeginPeriod( 1 );
		}
#endif
		Sleep( nMilliseconds );
#elif PS3
		sys_timer_usleep( nMilliseconds * 1000 );
#elif defined(POSIX)
		usleep( nMilliseconds * 1000 );
#endif
	}
	else
	{
		// Zero-length request: pause once instead of sleeping.
		ThreadPause();
	}
}
PLATFORM_INTERFACE bool ThreadJoin( ThreadHandle_t, unsigned timeout = TT_INFINITE );
PLATFORM_INTERFACE void ThreadSetDebugName( ThreadHandle_t hThread, const char *pszName );

16
public/tier1/memhelpers.h

@ -11,21 +11,15 @@ namespace memutils @@ -11,21 +11,15 @@ namespace memutils
// Copies n elements from src to dest by element-wise assignment, walking both
// ranges forward from index 0.
//   dest - first element of the destination range
//   src  - first element of the source range
//   n    - number of elements to copy; 0 copies nothing
// The loop tests n before touching memory, so a zero count is safe (a
// decrement-first do/while would wrap n around and write out of bounds).
template<typename T>
inline void copy( T *dest, const T *src, size_t n )
{
	for(; n; n--)
		*(dest++) = *(src++);
}
// Assigns value to each of the n elements starting at dest, walking forward
// from index 0.
//   dest  - first element of the range to fill
//   value - value assigned to every element (taken by const reference so the
//           argument itself is not copied on the call)
//   n     - number of elements to fill; 0 writes nothing
// The loop tests n before touching memory, so a zero count is safe (a
// decrement-first do/while would wrap n around and write out of bounds).
template<typename T>
inline void set( T *dest, const T& value, size_t n )
{
	for(; n; n--)
		*(dest++) = value;
}
}

4
public/vstdlib/jobthread.h

@ -492,8 +492,8 @@ public: @@ -492,8 +492,8 @@ public:
//-----------------------------------------------------
// Thread event support (safe for NULL this to simplify code )
//-----------------------------------------------------
bool WaitForFinish( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; return ( !IsFinished() ) ? g_pThreadPool->YieldWait( this, dwTimeout ) : true; }
bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }
inline bool WaitForFinish( uint32 dwTimeout = TT_INFINITE, IThreadPool *pool = g_pThreadPool ) { if (!this) return true; return ( !IsFinished() ) ? pool->YieldWait( this, dwTimeout ) : true; }
inline bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }
CThreadEvent *AccessEvent() { return &m_CompleteEvent; }
//-----------------------------------------------------

53
tier0/threadtools.cpp

@ -485,59 +485,6 @@ bool ReleaseThreadHandle( ThreadHandle_t hThread ) @@ -485,59 +485,6 @@ bool ReleaseThreadHandle( ThreadHandle_t hThread )
//
//-----------------------------------------------------------------------------
// Suspends the calling thread for nMilliseconds milliseconds using the
// platform's native sleep call.
//   Windows: Sleep(); on _WIN32_PC the first call also requests a 1 ms timer
//            resolution.
//   PS3:     0 calls sys_ppu_thread_yield(), anything else sys_timer_usleep().
//   POSIX:   nanosleep().
void ThreadSleep(unsigned nMilliseconds)
{
#ifdef _WIN32
#ifdef _WIN32_PC
	static bool bInitialized = false;
	if ( !bInitialized )
	{
		bInitialized = true;
		// Set the timer resolution to 1 ms (default is 10.0, 15.6, 2.5, 1.0 or
		// some other value depending on hardware and software) so that we can
		// use Sleep( 1 ) to avoid wasting CPU time without missing our frame
		// rate.
		timeBeginPeriod( 1 );
	}
#endif
	Sleep( nMilliseconds );
#elif PS3
	if( nMilliseconds == 0 )
	{
		// sys_ppu_thread_yield doesn't seem to function properly, so sleep instead.
		// sys_timer_usleep( 60 );
		// NOTE(review): the note above says to sleep, yet the yield is what is
		// actually called - confirm which one is intended.
		sys_ppu_thread_yield();
	}
	else
	{
		sys_timer_usleep( nMilliseconds * 1000 );
	}
#elif defined(POSIX)
	// usleep() is only specified for intervals below 1,000,000 microseconds,
	// and nMilliseconds * 1000 wraps in 32-bit unsigned arithmetic for long
	// waits. Split the request into whole seconds plus a sub-second remainder
	// and hand it to nanosleep() instead.
	struct timespec tm;
	tm.tv_sec = nMilliseconds / 1000;
	tm.tv_nsec = (long)( nMilliseconds % 1000 ) * 1000000L;
	nanosleep( &tm, NULL );
#endif
}
//-----------------------------------------------------------------------------
// Suspends the calling thread for at least ns nanoseconds, rounded up to the
// granularity of the platform's sleep call.
//   Windows: Sleep() takes milliseconds.
//   PS3:     sys_timer_usleep() takes microseconds.
//   POSIX:   nanosleep() takes seconds + nanoseconds.
void ThreadNanoSleep(unsigned ns)
{
#ifdef _WIN32
	// Sleep() counts milliseconds, so convert ns -> ms (not ns -> us).
	// Ceil via divide-then-adjust so that ns near UINT_MAX cannot wrap.
	Sleep( ns / 1000000 + ( ( ns % 1000000 ) ? 1 : 0 ) );
#elif PS3
	// sys_timer_usleep() counts microseconds; ceil ns -> us.
	sys_timer_usleep( ns / 1000 + ( ( ns % 1000 ) ? 1 : 0 ) );
#elif defined(POSIX)
	// tv_nsec must stay below one second or nanosleep() rejects the request,
	// so carry whole seconds into tv_sec.
	struct timespec tm;
	tm.tv_sec = ns / 1000000000;
	tm.tv_nsec = ns % 1000000000;
	nanosleep( &tm, NULL );
#endif
}
//-----------------------------------------------------------------------------
#ifndef ThreadGetCurrentId
ThreadId_t ThreadGetCurrentId()
{

20
vstdlib/jobthread.cpp

@ -214,7 +214,11 @@ public: @@ -214,7 +214,11 @@ public:
//-----------------------------------------------------
virtual int YieldWait( CThreadEvent **pEvents, int nEvents, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
virtual int YieldWait( CJob **, int nJobs, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
void Yield( unsigned timeout );
// Blocks the caller by forwarding timeout unchanged to ThreadSleep().
// Asserts that the call is made from the main thread; the sleep itself is
// not conditional on that check.
inline void Yield( unsigned timeout )
{
Assert( ThreadInMainThread() );
ThreadSleep( timeout );
}
//-----------------------------------------------------
// Add a native job to the queue (master thread)
@ -656,20 +660,6 @@ int CThreadPool::YieldWait( CJob **ppJobs, int nJobs, bool bWaitAll, unsigned ti @@ -656,20 +660,6 @@ int CThreadPool::YieldWait( CJob **ppJobs, int nJobs, bool bWaitAll, unsigned ti
return YieldWait( handles.Base(), handles.Count(), bWaitAll, timeout);
}
//---------------------------------------------------------
// Blocks the caller by forwarding timeout unchanged to ThreadSleep().
// The call is asserted to come from the main thread, but the sleep happens
// either way: the former main-thread / other-thread branches were identical,
// so they are collapsed into a single call.
void CThreadPool::Yield( unsigned timeout )
{
	// @MULTICORE (toml 10/24/2006): not implemented
	Assert( ThreadInMainThread() );
	ThreadSleep( timeout );
}
//---------------------------------------------------------
// Add a job to the queue
//---------------------------------------------------------

Loading…
Cancel
Save