//========= Copyright Valve Corporation, All rights reserved. ============// // // Purpose: // //============================================================================= #include "sfmobjects/SFMPhonemeExtractor.h" #include "tier2/riff.h" #include "PhonemeConverter.h" #include "filesystem.h" #include "tier1/utlbuffer.h" #include "sentence.h" #include "movieobjects/dmesound.h" #include "movieobjects/dmeanimationset.h" #include "movieobjects/dmebookmark.h" #include "movieobjects/dmeclip.h" #include "movieobjects/dmechannel.h" #include "soundchars.h" #include "tier2/p4helpers.h" #include "tier2/soundutils.h" #include "tier1/utldict.h" #include // WAVEFORMATEX, WAVEFORMAT and ADPCM WAVEFORMAT!!! #include // memdbgon must be the last include file in a .cpp file!!! #include "tier0/memdbgon.h" static const char *s_pAttributeValueNames[LOG_PREVIEW_FLEX_CHANNEL_COUNT] = { "value", "balance", "multilevel" }; static const char *s_pDefaultAttributeValueNames[LOG_PREVIEW_FLEX_CHANNEL_COUNT] = { "defaultValue", "defaultBalance", "defaultMultilevel" }; struct Extractor { PE_APITYPE apitype; CSysModule *module; IPhonemeExtractor *extractor; }; //----------------------------------------------------------------------------- // Implementations of the phoneme extractor //----------------------------------------------------------------------------- class CSFMPhonemeExtractor : public ISFMPhonemeExtractor { public: CSFMPhonemeExtractor(); // Inherited from ISFMPhonemeExtractor virtual bool Init(); virtual void Shutdown(); virtual int GetAPICount(); virtual void GetAPIInfo( int index, CUtlString* pPrintName, PE_APITYPE *pAPIType ); virtual void Extract( const PE_APITYPE& apiType, ExtractDesc_t& info, bool bWritePhonemesToWavFiles ); virtual void ReApply( ExtractDesc_t& info ); virtual bool GetSentence( CDmeGameSound *gameSound, CSentence& sentence ); private: int FindExtractor( PE_APITYPE type ); bool GetWaveFormat( const char *filename, CUtlBuffer* pFormat, int *pDataSize, CSentence& sentence, bool &bGotSentence ); void LogPhonemes( int nItemIndex, ExtractDesc_t& info ); void ClearInterstitialSpaces( CDmeChannelsClip *pChannelsClip, CUtlDict< LogPreview_t *, int >& controlLookup, ExtractDesc_t& info ); void StampControlValueLogs( CDmePreset *preset, DmeTime_t tHeadPosition, float flIntensity, CUtlDict< LogPreview_t *, int > &controlLookup ); void WriteCurrentValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ); void WriteDefaultValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ); void BuildPhonemeLogList( CUtlVector< LogPreview_t > &list, CUtlVector< CDmeLog * > &logs ); CDmeChannelsClip* FindFacialChannelsClip( const CUtlVector< LogPreview_t > &list ); void BuildPhonemeToPresetMapping( const CUtlVector< CBasePhonemeTag * > &stream, CDmeAnimationSet *pSet, CDmePresetGroup * pPresetGroup, CUtlDict< CDmePreset *, unsigned short > &phonemeToPresetDict ); CUtlVector< Extractor > m_Extractors; int m_nCurrentExtractor; }; //----------------------------------------------------------------------------- // Singleton //----------------------------------------------------------------------------- static CSFMPhonemeExtractor g_ExtractorSingleton; ISFMPhonemeExtractor *sfm_phonemeextractor = &g_ExtractorSingleton; //----------------------------------------------------------------------------- // Constructor //----------------------------------------------------------------------------- CSFMPhonemeExtractor::CSFMPhonemeExtractor() : m_nCurrentExtractor( -1 ) { } //----------------------------------------------------------------------------- // Init, shutdown //----------------------------------------------------------------------------- bool CSFMPhonemeExtractor::Init() { // Enumerate modules under bin folder of exe FileFindHandle_t findHandle; const char *pFilename = g_pFullFileSystem->FindFirstEx( "phonemeextractors/*.dll", "EXECUTABLE_PATH", &findHandle ); while( pFilename ) { char fullpath[ 512 ]; Q_snprintf( fullpath, sizeof( fullpath ), "phonemeextractors/%s", pFilename ); // Msg( "Loading extractor from %s\n", fullpath ); Extractor e; e.module = g_pFullFileSystem->LoadModule( fullpath ); if ( !e.module ) { pFilename = g_pFullFileSystem->FindNext( findHandle ); continue; } CreateInterfaceFn factory = Sys_GetFactory( e.module ); if ( !factory ) { pFilename = g_pFullFileSystem->FindNext( findHandle ); continue; } e.extractor = ( IPhonemeExtractor * )factory( VPHONEME_EXTRACTOR_INTERFACE, NULL ); if ( !e.extractor ) { Warning( "Unable to get IPhonemeExtractor interface version %s from %s\n", VPHONEME_EXTRACTOR_INTERFACE, fullpath ); pFilename = g_pFullFileSystem->FindNext( findHandle ); continue; } e.apitype = e.extractor->GetAPIType(); m_Extractors.AddToTail( e ); pFilename = g_pFullFileSystem->FindNext( findHandle ); } g_pFullFileSystem->FindClose( findHandle ); return true; } void CSFMPhonemeExtractor::Shutdown() { int c = m_Extractors.Count(); for ( int i = c - 1; i >= 0; i-- ) { Extractor *e = &m_Extractors[ i ]; g_pFullFileSystem->UnloadModule( e->module ); } m_Extractors.RemoveAll(); } //----------------------------------------------------------------------------- // Finds an extractor of a particular type //----------------------------------------------------------------------------- int CSFMPhonemeExtractor::FindExtractor( PE_APITYPE type ) { for ( int i=0; i < m_Extractors.Count(); i++ ) { if ( m_Extractors[i].apitype == type ) return i; } return -1; } //----------------------------------------------------------------------------- // Iterates over extractors //----------------------------------------------------------------------------- int CSFMPhonemeExtractor::GetAPICount() { return m_Extractors.Count(); } void CSFMPhonemeExtractor::GetAPIInfo( int index, CUtlString* pPrintName, PE_APITYPE *pAPIType ) { Assert( pPrintName ); Assert( pAPIType ); pPrintName->Set( m_Extractors[ index ].extractor->GetName() ); *pAPIType = m_Extractors[ index ].apitype; } static void ParseSentence( CSentence& sentence, IterateRIFF &walk ) { CUtlBuffer buf( 0, 0, CUtlBuffer::TEXT_BUFFER ); buf.EnsureCapacity( walk.ChunkSize() ); walk.ChunkRead( buf.Base() ); buf.SeekPut( CUtlBuffer::SEEK_HEAD, walk.ChunkSize() ); sentence.InitFromDataChunk( buf.Base(), buf.TellPut() ); } bool CSFMPhonemeExtractor::GetWaveFormat( const char *filename, CUtlBuffer *pBuf, int *pDataSize, CSentence& sentence, bool &bGotSentence ) { InFileRIFF riff( filename, *g_pFSIOReadBinary ); Assert( riff.RIFFName() == RIFF_WAVE ); // set up the iterator for the whole file (root RIFF is a chunk) IterateRIFF walk( riff, riff.RIFFSize() ); bool gotFmt = false; bool gotData = false; bGotSentence = false; // Walk input chunks and copy to output while ( walk.ChunkAvailable() ) { switch ( walk.ChunkName() ) { case WAVE_FMT: { pBuf->SeekPut( CUtlBuffer::SEEK_HEAD, walk.ChunkSize() ); walk.ChunkRead( pBuf->Base() ); gotFmt = true; } break; case WAVE_DATA: { *pDataSize = walk.ChunkSize(); gotData = true; } break; case WAVE_VALVEDATA: { bGotSentence = true; ParseSentence( sentence, walk ); } break; default: break; } // Done if ( gotFmt && gotData && bGotSentence ) return true; walk.ChunkNext(); } return ( gotFmt && gotData ); } bool CSFMPhonemeExtractor::GetSentence( CDmeGameSound *gameSound, CSentence& sentence ) { const char *filename = gameSound->m_SoundName.Get(); Assert( filename && filename [ 0 ] ); char soundname[ 512 ]; // Note, calling PSkipSoundChars to remove any decorator characters used by the engine!!! Q_snprintf( soundname, sizeof( soundname ), "sound/%s", PSkipSoundChars( filename ) ); Q_FixSlashes( soundname ); char fullpath[ 512 ]; g_pFullFileSystem->RelativePathToFullPath( soundname, "GAME", fullpath, sizeof( fullpath ) ); // Get sound file metrics of interest CUtlBuffer buf; int nDataSize; bool bValidSentence = false; if ( !GetWaveFormat( soundname, &buf, &nDataSize, sentence, bValidSentence ) ) return false; return bValidSentence; } static void BuildPhonemeStream( CSentence& in, CUtlVector< CBasePhonemeTag * >& list ) { for ( int i = 0; i < in.m_Words.Count(); ++i ) { CWordTag *w = in.m_Words[ i ]; if ( !w ) continue; for ( int j = 0; j < w->m_Phonemes.Count(); ++j ) { CPhonemeTag *ph = w->m_Phonemes[ j ]; if ( !ph ) continue; CBasePhonemeTag *newTag = new CBasePhonemeTag( *ph ); list.AddToTail( newTag ); } } if ( !in.m_Words.Count() && in.m_RunTimePhonemes.Count() ) { for ( int i = 0 ; i < in.m_RunTimePhonemes.Count(); ++i ) { CBasePhonemeTag *newTag = new CBasePhonemeTag( *in.m_RunTimePhonemes[ i ] ); list.AddToTail( newTag ); } } } //----------------------------------------------------------------------------- // Purpose: Same the phoneme data into the sound files //----------------------------------------------------------------------------- static void StoreValveDataChunk( CSentence& sentence, IterateOutputRIFF& store ) { // Buffer and dump data CUtlBuffer buf( 0, 0, CUtlBuffer::TEXT_BUFFER ); sentence.SaveToBuffer( buf ); // Copy into store store.ChunkWriteData( buf.Base(), buf.TellPut() ); } static bool SaveSentenceToWavFile( const char *pWavFile, CSentence& sentence ) { char pTempFile[ 512 ]; Q_StripExtension( pWavFile, pTempFile, sizeof( pTempFile ) ); Q_DefaultExtension( pTempFile, ".tmp", sizeof( pTempFile ) ); if ( g_pFullFileSystem->FileExists( pTempFile, "GAME" ) ) { g_pFullFileSystem->RemoveFile( pTempFile, "GAME" ); } CP4AutoEditAddFile p4Checkout( pWavFile ); if ( !g_pFullFileSystem->IsFileWritable( pWavFile ) ) { Warning( "%s is not writable, can't save sentence data to file\n", pWavFile ); return false; } // Rename original pWavFile to temp g_pFullFileSystem->RenameFile( pWavFile, pTempFile, "GAME" ); // NOTE: Put this in it's own scope so that the destructor for outfileRFF actually closes the file!!!! { // Read from Temp InFileRIFF riff( pTempFile, *g_pFSIOReadBinary ); Assert( riff.RIFFName() == RIFF_WAVE ); // set up the iterator for the whole file (root RIFF is a chunk) IterateRIFF walk( riff, riff.RIFFSize() ); // And put data back into original pWavFile by name OutFileRIFF riffout( pWavFile, *g_pFSIOWriteBinary ); IterateOutputRIFF store( riffout ); bool bWordTrackWritten = false; // Walk input chunks and copy to output while ( walk.ChunkAvailable() ) { store.ChunkStart( walk.ChunkName() ); switch ( walk.ChunkName() ) { case WAVE_VALVEDATA: { // Overwrite data StoreValveDataChunk( sentence, store ); bWordTrackWritten = true; } break; default: store.CopyChunkData( walk ); break; } store.ChunkFinish(); walk.ChunkNext(); } // If we didn't write it above, write it now if ( !bWordTrackWritten ) { store.ChunkStart( WAVE_VALVEDATA ); StoreValveDataChunk( sentence, store ); store.ChunkFinish(); } } // Remove temp file g_pFullFileSystem->RemoveFile( pTempFile, NULL ); return true; } //----------------------------------------------------------------------------- // Main entry point for phoneme extraction //----------------------------------------------------------------------------- void CSFMPhonemeExtractor::Extract( const PE_APITYPE& apiType, ExtractDesc_t& info, bool bWritePhonemesToWavFiles ) { if ( !info.m_pSet ) return; int iExtractor = FindExtractor( apiType ); if ( iExtractor == -1 ) return; Extractor& extractor = m_Extractors[ iExtractor ]; int nWorkItem; for ( nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem ) { CExtractInfo& workItem = info.m_WorkList[ nWorkItem ]; workItem.m_flDuration = 0.0f; CSentence in; CSentence out; in.SetText( workItem.m_sHintText.String() ); out.SetText( workItem.m_sHintText.String() ); const char *pFileName = workItem.m_pSound->m_SoundName.Get(); Assert( pFileName && pFileName [ 0 ] ); char pSoundName[ 512 ]; // Note, calling PSkipSoundChars to remove any decorator characters used by the engine!!! Q_snprintf( pSoundName, sizeof( pSoundName ), "sound/%s", PSkipSoundChars( pFileName ) ); Q_FixSlashes( pSoundName ); char pFullPath[ 512 ]; g_pFullFileSystem->RelativePathToFullPath( pSoundName, "GAME", pFullPath, sizeof( pFullPath ) ); // Get sound file metrics of interest CUtlBuffer buf; WAVEFORMATEX *format; int nDataSize; if ( !GetWaveFormat( pSoundName, &buf, &nDataSize, workItem.m_Sentence, workItem.m_bSentenceValid ) ) continue; format = ( WAVEFORMATEX * )buf.Base(); if ( !( format->wBitsPerSample > ( 1 << 3 ) ) ) { // Have to warn and early-out here to avoid crashing with "integer divide by zero" below Warning( "Cannot extract phonemes from '%s', %u bits per sample.\n", pSoundName, format->wBitsPerSample ); continue; } int nBitsPerSample = format->wBitsPerSample; float flSampleRate = (float)format->nSamplesPerSec; int nChannels = format->nChannels; int nSampleCount = nDataSize / ( nBitsPerSample >> 3 ); float flTrueSampleSize = ( nBitsPerSample * nChannels ) >> 3; if ( format->wFormatTag == WAVE_FORMAT_ADPCM ) { nBitsPerSample = 16; flTrueSampleSize = 0.5f; ADPCMWAVEFORMAT *pFormat = (ADPCMWAVEFORMAT *)buf.Base(); int blockSize = ((pFormat->wSamplesPerBlock - 2) * pFormat->wfx.nChannels ) / 2; blockSize += 7 * pFormat->wfx.nChannels; int blockCount = nDataSize / blockSize; int blockRem = nDataSize % blockSize; // total samples in complete blocks nSampleCount = blockCount * pFormat->wSamplesPerBlock; // add remaining in a short block if ( blockRem ) { nSampleCount += pFormat->wSamplesPerBlock - (((blockSize - blockRem) * 2) / nChannels); } } if ( flSampleRate > 0.0f ) { workItem.m_flDuration = (float)nSampleCount / flSampleRate; } in.CreateEventWordDistribution( workItem.m_sHintText.String(), workItem.m_flDuration ); if ( !workItem.m_bUseSentence || !workItem.m_bSentenceValid ) { extractor.extractor->Extract( pFullPath, (int)( workItem.m_flDuration * flSampleRate * flTrueSampleSize ), Msg, in, out ); // Tracker 57389: // Total hack to fix a bug where the Lipsinc extractor is messing up the # channels on 16 bit stereo waves if ( apiType == SPEECH_API_LIPSINC && nChannels == 2 && nBitsPerSample == 16 ) { flTrueSampleSize *= 2.0f; } float bytespersecond = flSampleRate * flTrueSampleSize; int i; // Now convert byte offsets to times for ( i = 0; i < out.m_Words.Size(); i++ ) { CWordTag *tag = out.m_Words[ i ]; Assert( tag ); if ( !tag ) continue; tag->m_flStartTime = ( float )(tag->m_uiStartByte ) / bytespersecond; tag->m_flEndTime = ( float )(tag->m_uiEndByte ) / bytespersecond; for ( int j = 0; j < tag->m_Phonemes.Size(); j++ ) { CPhonemeTag *ptag = tag->m_Phonemes[ j ]; Assert( ptag ); if ( !ptag ) continue; ptag->SetStartTime( ( float )(ptag->m_uiStartByte ) / bytespersecond ); ptag->SetEndTime( ( float )(ptag->m_uiEndByte ) / bytespersecond ); } } if ( bWritePhonemesToWavFiles ) { SaveSentenceToWavFile( pFullPath, out ); } } else { Msg( "Using .wav file phonemes for (%s)\n", pSoundName ); out = workItem.m_Sentence; } // Now create channel data workItem.ClearTags(); BuildPhonemeStream( out, workItem.m_ApplyTags ); } if ( info.m_bCreateBookmarks ) { info.m_pSet->GetBookmarks().RemoveAll(); } for ( nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem ) { LogPhonemes( nWorkItem, info ); } } //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- static bool UniquePhonemeLessFunc( CBasePhonemeTag * const & lhs, CBasePhonemeTag * const & rhs ) { return lhs->GetPhonemeCode() < rhs->GetPhonemeCode(); } void CSFMPhonemeExtractor::BuildPhonemeToPresetMapping( const CUtlVector< CBasePhonemeTag * > &stream, CDmeAnimationSet *pSet, CDmePresetGroup *pPresetGroup, CUtlDict< CDmePreset *, unsigned short > &phonemeToPresetDict ) { int i; CUtlRBTree< CBasePhonemeTag * > uniquePhonemes( 0, 0, UniquePhonemeLessFunc ); for ( i = 0; i < stream.Count(); ++i ) { CBasePhonemeTag *tag = stream[ i ]; if ( uniquePhonemes.Find( tag ) == uniquePhonemes.InvalidIndex() ) { uniquePhonemes.Insert( tag ); } } for ( i = uniquePhonemes.FirstInorder(); i != uniquePhonemes.InvalidIndex(); i = uniquePhonemes.NextInorder( i ) ) { CBasePhonemeTag *tag = uniquePhonemes[ i ]; // Convert phoneme code to text char ph[ 32 ]; Q_strncpy( ph, ConvertPhoneme( tag->GetPhonemeCode() ), sizeof( ph ) ); char remappedph[ 32 ]; // By default we search for a preset name p_xxx where xxx is the phoneme string Q_snprintf( remappedph, sizeof( remappedph ), "p_%s", ph ); // Now find the preset in the animation set converter CDmePhonemeMapping *mapping = pSet->FindMapping( ph ); if ( mapping ) { Q_strncpy( remappedph, mapping->GetValueString( "preset" ), sizeof( remappedph ) ); } // Now look up the preset, if it exists CDmePreset *preset = pPresetGroup->FindPreset( remappedph ); if ( !preset ) { Warning( "Animation set '%s' missing phoneme preset for '%s' -> '%s'\n", pSet->GetName(), ph, remappedph ); continue; } // Add to dictionary if it's not already there if ( phonemeToPresetDict.Find( ph ) == phonemeToPresetDict.InvalidIndex() ) { phonemeToPresetDict.Insert( ph, preset ); } } } //----------------------------------------------------------------------------- // Finds the channels clip which refers to facial control values //----------------------------------------------------------------------------- CDmeChannelsClip* CSFMPhonemeExtractor::FindFacialChannelsClip( const CUtlVector< LogPreview_t > &list ) { CDmeChannelsClip *pChannelsClip = NULL; int i; for ( i = list.Count() - 1; i >= 0; --i ) { const LogPreview_t &lp = list[i]; CDmeChannelsClip *check = FindAncestorReferencingElement< CDmeChannelsClip >( (CDmElement *)lp.m_hChannels[ 0 ].Get() ); if ( !pChannelsClip && check ) { pChannelsClip = check; } else { if ( pChannelsClip != check ) { Warning( "Selected controls overlap multiple channels clips!!!\n" ); } } } if ( !pChannelsClip ) { Warning( "Unable to determine destination channels clip!!!\n" ); } return pChannelsClip; } //----------------------------------------------------------------------------- // Builds the list of logs which target facial control values //----------------------------------------------------------------------------- void CSFMPhonemeExtractor::BuildPhonemeLogList( CUtlVector< LogPreview_t > &list, CUtlVector< CDmeLog * > &logs ) { for ( int i = 0; i < list.Count(); ++i ) { LogPreview_t& p = list[ i ]; for ( int channel = 0; channel < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++channel ) { CDmeChannel *ch = p.m_hChannels[ channel ]; if ( !ch ) continue; CDmeLog *log = p.m_hChannels[ channel ]->GetLog(); if ( !log ) continue; logs.AddToTail( log ); } } } //----------------------------------------------------------------------------- // Writes default values into all log layers targetting facial control values //----------------------------------------------------------------------------- void CSFMPhonemeExtractor::WriteDefaultValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ) { // Write a zero into all relevant log layers for ( int j = controlLookup.First(); j != controlLookup.InvalidIndex(); j = controlLookup.Next( j ) ) { LogPreview_t* lp = controlLookup[ j ]; CDmElement *pControl = lp->m_hControl; for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex ) { CDmeChannel *pChannel = lp->m_hChannels[ chIndex ]; if ( !pChannel ) continue; // Now get the log for the channel CDmeFloatLog *pFloatLog = CastElement< CDmeFloatLog >( pChannel->GetLog() ); if ( !pFloatLog ) continue; CDmeFloatLogLayer *pLayer = pFloatLog->GetLayer( pFloatLog->GetTopmostLayer() ); if ( !pLayer ) continue; float flDefaultValue = pControl->GetValue< float >( s_pDefaultAttributeValueNames[chIndex] ); pLayer->InsertKey( tHeadPosition, flDefaultValue ); } } } //----------------------------------------------------------------------------- // Creates a new log key based on the interpolated value at that time //----------------------------------------------------------------------------- void CSFMPhonemeExtractor::WriteCurrentValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup ) { // Write a zero into all relevant log layers for ( int j = controlLookup.First(); j != controlLookup.InvalidIndex(); j = controlLookup.Next( j ) ) { LogPreview_t* lp = controlLookup[ j ]; for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex ) { CDmeChannel *pChannel = lp->m_hChannels[ chIndex ]; if ( !pChannel ) continue; // Now get the log for the channel CDmeFloatLog *pFloatLog = CastElement< CDmeFloatLog >( pChannel->GetLog() ); if ( !pFloatLog ) continue; CDmeFloatLogLayer *pLayer = pFloatLog->GetLayer( pFloatLog->GetTopmostLayer() ); if ( !pLayer ) continue; float flCurrentValue = pLayer->GetValue( tHeadPosition ); pLayer->InsertKey( tHeadPosition, flCurrentValue ); } } } //----------------------------------------------------------------------------- // Samples extracted phoneme data and stamps that values into control value logs //----------------------------------------------------------------------------- void CSFMPhonemeExtractor::StampControlValueLogs( CDmePreset *preset, DmeTime_t tHeadPosition, float flIntensity, CUtlDict< LogPreview_t *, int > &controlLookup ) { // Now walk the logs required by the preset const CDmrElementArray< CDmElement > &controlValues = preset->GetControlValues( ); for ( int j = 0; j < controlValues.Count(); ++j ) { // This control contains the preset value CDmElement *presetControl = controlValues[ j ]; if ( !presetControl ) continue; int visIndex = controlLookup.Find( presetControl->GetName() ); if ( visIndex == controlLookup.InvalidIndex() ) continue; LogPreview_t* lp = controlLookup[ visIndex ]; for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex ) { CDmeChannel *ch = lp->m_hChannels[ chIndex ]; if ( !ch ) continue; // Whereas this control contains the "default" value for the slider (since the presetControl won't have that value) CDmElement *defaultValueControl = lp->m_hControl.Get(); if ( !defaultValueControl ) continue; // Now get the log for the channel CDmeLog *log = ch->GetLog(); if ( !log ) { Assert( 0 ); continue; } CDmeFloatLog *floatLog = CastElement< CDmeFloatLog >( log ); if ( !floatLog ) continue; CDmeFloatLogLayer *pLayer = floatLog->GetLayer( floatLog->GetTopmostLayer() ); if ( !pLayer ) continue; float flDefault = defaultValueControl->GetValue< float >( s_pDefaultAttributeValueNames[chIndex] ); float flControlValue = presetControl->GetValue< float >( s_pAttributeValueNames[ chIndex ] ); float flNewValue = flIntensity * ( flControlValue - flDefault ); float flCurrent = pLayer->GetValue( tHeadPosition ) - flDefault; // Accumulate new value into topmost layer pLayer->InsertKey( tHeadPosition, flCurrent + flNewValue + flDefault ); } } } void CSFMPhonemeExtractor::ClearInterstitialSpaces( CDmeChannelsClip *pChannelsClip, CUtlDict< LogPreview_t *, int >& controlLookup, ExtractDesc_t& info ) { Assert( info.m_pShot ); Assert( pChannelsClip ); if ( info.m_WorkList.Count() == 0 ) return; // This is handled by the main layering code... if ( info.m_nExtractType == EXTRACT_WIPE_SOUNDS ) return; // Now walk through all relevant logs CUtlVector< CDmeLog * > logs; BuildPhonemeLogList( info.m_ControlList, logs ); DmeTime_t tMinTime( DMETIME_MAXTIME ); DmeTime_t tMaxTime( DMETIME_MINTIME ); int i; // Walk work items and figure out time bounds for ( i = 0; i < info.m_WorkList.Count(); ++i ) { CExtractInfo &item = info.m_WorkList[ i ]; CUtlVector< CDmeHandle< CDmeClip > > srcStack; CUtlVector< CDmeHandle< CDmeClip > > dstStack; // Convert original .wav start to animation set channels clip relative time item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, info.m_pShot ); // NOTE: Time bounds measured in sound media time goes from 0 -> flWaveDuration DmeTime_t tSoundMediaStartTime = CDmeClip::FromChildMediaTime( srcStack, DMETIME_ZERO, false ); DmeTime_t tSoundMediaEndTime = CDmeClip::FromChildMediaTime( srcStack, DmeTime_t( item.m_flDuration ), false ); // NOTE: Start and end time are measured in sound media time DmeTime_t tStartTime = item.m_pClip->GetStartInChildMediaTime(); DmeTime_t tEndTime = item.m_pClip->GetEndInChildMediaTime(); // And convert back down into channels clip relative time pChannelsClip->BuildClipStack( &dstStack, info.m_pMovie, info.m_pShot ); // Now convert back down to channels clip relative time DmeTime_t tChannelMediaStartTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaStartTime, false ); DmeTime_t tChannelMediaEndTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaEndTime, false ); // Find a scale + offset which transforms data in media space of the sound [namely, the phonemes] // into the media space of the channels [the logs that drive the facial animation] DmeTime_t tEndDuration = tChannelMediaEndTime - tChannelMediaStartTime; double flScale = ( item.m_flDuration != 0.0f ) ? tEndDuration.GetSeconds() / item.m_flDuration : 0.0f; DmeTime_t tOffset = tChannelMediaStartTime; DmeTime_t tChannelRelativeStartTime( tStartTime * flScale ); tChannelRelativeStartTime += tOffset; DmeTime_t tChannelRelativeEndTime( tEndTime * flScale ); tChannelRelativeEndTime += tOffset; if ( tChannelRelativeStartTime < tMinTime ) { tMinTime = tChannelRelativeStartTime; } if ( tChannelRelativeEndTime > tMaxTime ) { tMaxTime = tChannelRelativeEndTime; } } // Bloat by one quantum tMinTime -= DMETIME_MINDELTA; tMaxTime += DMETIME_MINDELTA; for ( i = 0; i < logs.Count(); ++i ) { CDmeLog *log = logs[ i ]; Assert( log->GetNumLayers() == 1 ); CDmeLogLayer *layer = log->GetLayer( log->GetTopmostLayer() ); if ( info.m_nExtractType == EXTRACT_WIPE_RANGE ) { // Write default value keys into log // Write a default value at that time WriteDefaultValuesIntoLogLayers( tMinTime, controlLookup ); // Write a default value at that time WriteDefaultValuesIntoLogLayers( tMaxTime, controlLookup ); // Now discard all keys > tMinTime and < tMaxTime for ( int j = layer->GetKeyCount() - 1; j >= 0; --j ) { DmeTime_t &t = layer->GetKeyTime( j ); if ( t <= tMinTime ) continue; if ( t >= tMaxTime ) continue; layer->RemoveKey( j ); } } else { Assert( info.m_nExtractType == EXTRACT_WIPE_CLIP ); layer->ClearKeys(); } } } void AddAnimSetBookmarkAtSoundMediaTime( const char *pName, DmeTime_t tStart, DmeTime_t tEnd, const CUtlVector< CDmeHandle< CDmeClip > > &srcStack, ExtractDesc_t& info ) { tStart = CDmeClip::FromChildMediaTime( srcStack, tStart, false ); tEnd = CDmeClip::FromChildMediaTime( srcStack, tEnd, false ); tStart = info.m_pShot->ToChildMediaTime( tStart, false ); tEnd = info.m_pShot->ToChildMediaTime( tEnd, false ); CDmeBookmark *pBookmark = CreateElement< CDmeBookmark >( pName ); pBookmark->SetNote( pName ); pBookmark->SetTime( tStart ); pBookmark->SetDuration( tEnd - tStart ); info.m_pSet->GetBookmarks().AddToTail( pBookmark ); } //----------------------------------------------------------------------------- // Main entry point for generating phoneme logs //----------------------------------------------------------------------------- void CSFMPhonemeExtractor::LogPhonemes( int nItemIndex, ExtractDesc_t& info ) { CExtractInfo &item = info.m_WorkList[ nItemIndex ]; // Validate input parameters Assert( info.m_pSet && item.m_pClip && item.m_pSound ); if ( !info.m_pSet || !item.m_pClip || !item.m_pSound ) return; CDmePresetGroup *pPresetGroup = info.m_pSet->FindPresetGroup( "phoneme" ); if ( !pPresetGroup ) { Warning( "Animation set '%s' missing preset group 'phoneme'\n", info.m_pSet->GetName() ); return; } if ( !info.m_pSet->GetPhonemeMap().Count() ) { info.m_pSet->RestoreDefaultPhonemeMap(); } // Walk through phoneme stack and build list of unique presets CUtlDict< CDmePreset *, unsigned short > phonemeToPresetDict; BuildPhonemeToPresetMapping( item.m_ApplyTags, info.m_pSet, pPresetGroup, phonemeToPresetDict ); CDmeChannelsClip *pChannelsClip = FindFacialChannelsClip( info.m_ControlList ); if ( !pChannelsClip ) return; // Build a fast lookup of the visible sliders int i; CUtlDict< LogPreview_t *, int > controlLookup; for ( i = 0; i < info.m_ControlList.Count(); ++i ) { controlLookup.Insert( info.m_ControlList[ i ].m_hControl->GetName(), &info.m_ControlList[ i ] ); } // Only need to do this on the first item and we have multiple .wavs selected if ( nItemIndex == 0 && info.m_WorkList.Count() > 1 ) { ClearInterstitialSpaces( pChannelsClip, controlLookup, info ); } // Set up time selection, put channels into record and stamp out keyframes // Convert original .wav start to animation set channels clip relative time CUtlVector< CDmeHandle< CDmeClip > > srcStack; item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, info.m_pShot ); if ( srcStack.Count() == 0 ) { item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, NULL ); if ( srcStack.Count() == 0 ) { Msg( "Couldn't build stack sound clip to current shot\n" ); return; } } // NOTE: Time bounds measured in sound media time goes from 0 -> flWaveDuration DmeTime_t tSoundMediaStartTime = CDmeClip::FromChildMediaTime( srcStack, DMETIME_ZERO, false ); DmeTime_t tSoundMediaEndTime = CDmeClip::FromChildMediaTime( srcStack, DmeTime_t( item.m_flDuration ), false ); // NOTE: Start and end time are measured in sound media time DmeTime_t tStartTime = item.m_pClip->GetStartInChildMediaTime(); DmeTime_t tEndTime = item.m_pClip->GetEndInChildMediaTime(); // And convert back down into channels clip relative time CUtlVector< CDmeHandle< CDmeClip > > dstStack; pChannelsClip->BuildClipStack( &dstStack, info.m_pMovie, info.m_pShot ); // Now convert back down to channels clip relative time DmeTime_t tChannelMediaStartTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaStartTime, false ); DmeTime_t tChannelMediaEndTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaEndTime, false ); // Find a scale + offset which transforms data in media space of the sound [namely, the phonemes] // into the media space of the channels [the logs that drive the facial animation] DmeTime_t tEndDuration = tChannelMediaEndTime - tChannelMediaStartTime; double flScale = ( item.m_flDuration != 0.0f ) ? tEndDuration.GetSeconds() / item.m_flDuration : 0.0f; DmeTime_t tOffset = tChannelMediaStartTime; CUtlVector< CDmeLog * > logs; BuildPhonemeLogList( info.m_ControlList, logs ); // Add new write layer to each recording log for ( i = 0; i < logs.Count(); ++i ) { logs[ i ]->AddNewLayer(); } // Iterate over the entire range of the sound double flStartSoundTime = max( 0, tStartTime.GetSeconds() ); double flEndSoundTime = min( item.m_flDuration, tEndTime.GetSeconds() ); // Stamp keys right before and after the sound so as to // not generate new values outside the import time range DmeTime_t tPrePhonemeTime( flStartSoundTime * flScale ); tPrePhonemeTime += tOffset - DMETIME_MINDELTA; WriteCurrentValuesIntoLogLayers( tPrePhonemeTime, controlLookup ); DmeTime_t tPostPhonemeTime( flEndSoundTime * flScale ); tPostPhonemeTime += tOffset + DMETIME_MINDELTA; WriteCurrentValuesIntoLogLayers( tPostPhonemeTime, controlLookup ); // add bookmarks if ( info.m_bCreateBookmarks ) { AddAnimSetBookmarkAtSoundMediaTime( "start", tPrePhonemeTime, tPrePhonemeTime, srcStack, info ); for ( i = 0; i < item.m_ApplyTags.Count() ; ++i ) { CBasePhonemeTag *p = item.m_ApplyTags[ i ]; const char *pPhonemeName = ConvertPhoneme( p->GetPhonemeCode() ); DmeTime_t tStart = DmeTime_t( p->GetStartTime() ); DmeTime_t tEnd = DmeTime_t( p->GetEndTime() ); AddAnimSetBookmarkAtSoundMediaTime( pPhonemeName, tStart, tEnd, srcStack, info ); } AddAnimSetBookmarkAtSoundMediaTime( "end", tPostPhonemeTime, tPostPhonemeTime, srcStack, info ); } if ( info.m_nFilterType == EXTRACT_FILTER_HOLD || info.m_nFilterType == EXTRACT_FILTER_LINEAR ) { CDmePreset *pLastPreset = NULL; for ( i = 0; i < item.m_ApplyTags.Count() ; ++i ) { CBasePhonemeTag *p = item.m_ApplyTags[ i ]; DmeTime_t tStart = DmeTime_t( p->GetStartTime() ); DmeTime_t tEnd = DmeTime_t( p->GetEndTime() ); int idx = phonemeToPresetDict.Find( ConvertPhoneme( p->GetPhonemeCode() ) ); if ( idx == phonemeToPresetDict.InvalidIndex() ) continue; CDmePreset *preset = phonemeToPresetDict[ idx ]; if ( !preset ) continue; DmeTime_t tKeyTime = tStart * flScale + tOffset; if ( info.m_nFilterType == EXTRACT_FILTER_HOLD ) { // stamp value at end of phoneme (or default prior to first phoneme) // NOTE - this ignores phoneme length, but since all phonemes directly abut one another, this doesn't matter DmeTime_t tLastEnd = tKeyTime - DMETIME_MINDELTA; if ( tLastEnd > tPrePhonemeTime ) { WriteDefaultValuesIntoLogLayers( tKeyTime - DMETIME_MINDELTA, controlLookup ); if ( pLastPreset ) { StampControlValueLogs( pLastPreset, tKeyTime - DMETIME_MINDELTA, 1.0f, controlLookup ); } } pLastPreset = preset; } WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup ); StampControlValueLogs( preset, tKeyTime, 1.0f, controlLookup ); if ( info.m_nFilterType == EXTRACT_FILTER_HOLD && i == item.m_ApplyTags.Count() - 1 ) { // stamp value at end of last phoneme tKeyTime = tEnd * flScale + tOffset; tKeyTime = min( tKeyTime, tPostPhonemeTime ); WriteDefaultValuesIntoLogLayers( tKeyTime - DMETIME_MINDELTA, controlLookup ); StampControlValueLogs( preset, tKeyTime - DMETIME_MINDELTA, 1.0f, controlLookup ); // stamp default just after end of last phoneme to hold silence until tPostPhonemeTime WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup ); } } } else { Assert( info.m_nFilterType == EXTRACT_FILTER_FIXED_WIDTH ); double tStep = 1.0 / (double)clamp( info.m_flSampleRateHz, 1.0f, 1000.0f ); float flFilter = max( info.m_flSampleFilterSize, 0.001f ); float flOOFilter = 1.0f / flFilter; for ( double t = flStartSoundTime; t < flEndSoundTime; t += tStep ) { DmeTime_t tPhonemeTime( t ); // Determine the location of the sample in the channels clip DmeTime_t tKeyTime( t * flScale ); tKeyTime += tOffset; // Write a default value at that time WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup ); // Walk phonemes... for ( i = 0; i < item.m_ApplyTags.Count() ; ++i ) { CBasePhonemeTag *p = item.m_ApplyTags[ i ]; DmeTime_t tStart = DmeTime_t( p->GetStartTime() ); DmeTime_t tEnd = DmeTime_t( p->GetEndTime() ); bool bContinue = false; float flI = 0.0f; { DmeTime_t tFilter( flFilter ); if ( tStart >= tPhonemeTime + tFilter || tEnd <= tPhonemeTime ) bContinue = true; tStart = max( tStart, tPhonemeTime ); tEnd = min( tEnd, tPhonemeTime + tFilter ); flI = ( tEnd - tStart ).GetSeconds() * flOOFilter; } DmeTime_t dStart = tStart - tPhonemeTime; DmeTime_t dEnd = tEnd - tPhonemeTime; float t1 = dStart.GetSeconds() * flOOFilter; float t2 = dEnd.GetSeconds() * flOOFilter; Assert( bContinue == !( t1 < 1.0f && t2 > 0.0f ) ); if ( !( t1 < 1.0f && t2 > 0.0f ) ) continue; if ( t2 > 1 ) { t2 = 1; } if ( t1 < 0 ) { t1 = 0; } float flIntensity = ( t2 - t1 ); Assert( fabs( flI - flIntensity ) < 0.000001f ); int idx = phonemeToPresetDict.Find( ConvertPhoneme( p->GetPhonemeCode() ) ); if ( idx == phonemeToPresetDict.InvalidIndex() ) continue; CDmePreset *preset = phonemeToPresetDict[ idx ]; if ( !preset ) continue; StampControlValueLogs( preset, tKeyTime, flIntensity, controlLookup ); } } } // Flatten write layers for ( i = 0; i < logs.Count(); ++i ) { logs[ i ]->FlattenLayers( DMELOG_DEFAULT_THRESHHOLD, CDmeLog::FLATTEN_NODISCONTINUITY_FIXUP ); } } void CSFMPhonemeExtractor::ReApply( ExtractDesc_t& info ) { if ( info.m_bCreateBookmarks ) { info.m_pSet->GetBookmarks().RemoveAll(); } for ( int nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem ) { LogPhonemes( nWorkItem, info ); } }