Commit c99f4e0d authored by jwendt
Browse files

TTS implemented using correct resampling from 48kHz to 44.1kHz

parent 40065413
......@@ -316,6 +316,7 @@ add_definitions( "-DVACORE_CMAKE_DATE=\"${VACORE_CMAKE_DATE}\"" )
if( ITA_VACORE_WITH_TTS_SIGNAL_SOURCE )
vista_use_package( CereVoice REQUIRED FIND_DEPENDENCIES)
vista_use_package( samplerate REQUIRED FIND_DEPENDENCIES)
if( VCEREVOICE_FOUND )
add_definitions( "-DVACORE_WITH_TTS_SIGNAL_SOURCE" )
endif( )
......
......@@ -19,10 +19,13 @@
#include <VistaTools/VistaFileSystemDirectory.h>
#ifdef VACORE_WITH_TTS_SIGNAL_SOURCE
#include <cerevoice_eng.h>
#endif
//#define TTS_USE_WAV
CVATextToSpeechSignalSource::CVATextToSpeechSignalSource( const double dSampleRate, const int iBlockLength )
: ITADatasourceRealization(1, dSampleRate, (unsigned int)(iBlockLength))
......@@ -94,7 +97,6 @@ const float* CVATextToSpeechSignalSource::GetStreamBlock( const CVAAudiostreamSt
// return pAudioFile->GetStreamBlock(pStreamInfo);
}
// @todo: take samples from generated WAV file
return m_sbOut.data();
}
......@@ -209,14 +211,14 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
if (oParams.HasKey("voice") && oParams["voice"].IsString())
sVoice = oParams["voice"];
bool direct_replay = false;
if (oParams.HasKey("direct_replay") && oParams["direct_replay"].IsBool() && oParams["direct_replay"]){
direct_replay = true;
bool direct_playback = false;
if (oParams.HasKey("direct_playback") && oParams["direct_playback"].IsBool() && oParams["direct_playback"]){
direct_playback = true;
}
std::string id = "tmp";
if (!oParams.HasKey("id") || !oParams["id"].IsString()){
if (!direct_replay){
if (!direct_playback){
VA_WARN("CVATextToSpeechSignalSource", "No id is given for the prepare speech request, the user application has to give an unique id.");
return;
}
......@@ -242,23 +244,35 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
CPRCEN_engine_channel_reset(TTSEngine::getInstance().getEngine(), chan);
CPRCEN_engine_clear_callback(TTSEngine::getInstance().getEngine(), chan);
CPRCEN_engine_set_callback(TTSEngine::getInstance().getEngine(), chan, (void*)&data, VisemeProcessing);
//CPRCEN_engine_channel_to_file(TTSEngine::getInstance().getEngine(), chan, "D:/work/tts.wav", CPRCEN_RIFF); /* File output on channel */
#ifdef TTS_USE_WAV
CPRCEN_engine_channel_to_file(TTSEngine::getInstance().getEngine(), chan, "D:/work/tts.wav", CPRCEN_RIFF); /* File output on channel */
#endif
CPRC_abuf* buf = CPRCEN_engine_channel_speak(TTSEngine::getInstance().getEngine(), chan, sText.c_str(), sText.length(), true);
int sRate = CPRC_abuf_wav_srate(buf);
std::cout << "CereVoice rate is: " << sRate << " while sound signal rate is: " << GetSampleRate() << std::endl;
data.visemes += "</speech>\n";
data.visemes += "<event id=\"" + id + "\" start=\"0\" message=\"speech started\" />";
//data.visemes += "<event id=\"" + id + "\" start=\"0\" message=\"speech started\" />";
m_Visemes[id] = data.visemes;
#ifdef TTS_USE_WAV
ITASampleFrame* pAudioBuffer = new ITASampleFrame("D:/work/tts.wav");
CITAAudioSample* pAudioSample = new CITAAudioSample();
//pAudioSample->Load(*pAudioBuffer, 44100.0f);
pAudioSample->Load(*pAudioBuffer, TTSEngine::getInstance().getSampleRate());
m_AudioSampleFrames[id] = pAudioSample;
#else
ITASampleFrame* pAudioBuffer = new ITASampleFrame();
pAudioBuffer->init(1, data.floatBuffer.size(), false);
(*pAudioBuffer)[0].write(&data.floatBuffer[0], data.floatBuffer.size());
m_AudioSampleFrames[id] = pAudioBuffer;
if (direct_replay){
CITAAudioSample* pAudioSample = new CITAAudioSample();
pAudioSample->Load(*pAudioBuffer, TTSEngine::getInstance().getSampleRate());
m_AudioSampleFrames[id] = pAudioSample;
#endif
if (direct_playback){
CVAStruct oParams_play;
oParams_play["play_speech"] = id;
oParams_play["free_after"] = true;
......@@ -305,6 +319,8 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
ITASampleFrame* pAudioBuffer = it->second;
VA_INFO("CVATextToSpeechSignalSource", "GetSAmpleRAte: " + std::to_string(GetSampleRate()) + " engine::rate " + std::to_string(TTSEngine::getInstance().getSampleRate()));
if (m_pBufferDataSource != NULL)
delete m_pBufferDataSource;
m_pBufferDataSource = new ITABufferDatasource((*pAudioBuffer)[0].data(), pAudioBuffer->length(), GetSampleRate(), GetBlocklength());
......@@ -426,6 +442,18 @@ CVATextToSpeechSignalSource::TTSEngine::TTSEngine(){
VA_INFO("CVATextToSpeechSignalSource", "Loaded voice \"" + name + "\"");
}
//now check for the sampled voices which sample rate we have
m_sampleRate = -1.0;
int num_voices = CPRCEN_engine_get_voice_count(getEngine());
for (int i = 0; i < num_voices; i++) {
std::string strSamplerate = CPRCEN_engine_get_voice_info(getEngine(), i, "SAMPLE_RATE");
float rate = std::stof(strSamplerate);
if (m_sampleRate < 0.0)
m_sampleRate = rate;
if (rate != m_sampleRate)
VA_WARN("CVATextToSpeechSignalSource", "Voices with different sample rates are used namely "+std::to_string(rate)+" and "+std::to_string(m_sampleRate));
}
SetupPhonemeMapping();
//std::string licence_file = voices_path + "cerevoice_heather_4.0.0_48k.lic";
......@@ -447,6 +475,10 @@ CPRCEN_engine* CVATextToSpeechSignalSource::TTSEngine::getEngine() const
return m_pTTSEngine;
}
/* Returns the sample rate reported by the loaded CereVoice voices (taken from the
 * voice "SAMPLE_RATE" info at engine construction). Stays -1.0 if no voice has
 * been queried yet; when voices disagree, this holds the first voice's rate and a
 * warning was already emitted at load time. */
float CVATextToSpeechSignalSource::TTSEngine::getSampleRate() const {
return m_sampleRate;
}
std::string CVATextToSpeechSignalSource::TTSEngine::PhonemeToViseme(std::string phoneme) {
auto it = m_phonemeToId.find(phoneme);
......
......@@ -8,6 +8,7 @@
#include <ITADataSourceRealization.h>
#include <ITASampleBuffer.h>
#include <ITAAudioSample.h>
#include <ITAAtomicPrimitives.h>
class ITABufferDatasource;
......@@ -15,6 +16,7 @@ class CVACoreImpl;
class CPRCEN_engine;
class CPRC_abuf;
/** Text-to-speech signal source
*
* The TTS signal source generates sound from text using external libraries, like TTSRelay for Windows platforms.
......@@ -41,7 +43,7 @@ public:
* ["prepare_text"] = text to be spoken
* ["id"] = identificator that will be used for playing this speech and reference it (must be unique)
* ["voice"] = the voice to be used //if none is given or the one given cannot be found the standard voice is used (i.e. "Heather")
* ["direct_replay"] = true/false whether the audio should directly be played (in this case no id has to be given, should not be used for lipsyncing)
* ["direct_playback"] = true/false whether the audio should directly be played (in this case no id has to be given, should not be used for lipsyncing)
* This should be used to start a prepared TTS using a CVAStruct with:
* ["play_speech"] = identificator (int) of created speech
......@@ -86,6 +88,7 @@ private:
void operator=(TTSEngine const&) = delete;
CPRCEN_engine* getEngine() const;
float getSampleRate() const;
std::string PhonemeToViseme(std::string phoneme);
private:
......@@ -94,6 +97,7 @@ private:
CPRCEN_engine* m_pTTSEngine; //you must not delete this from outside!!!!!!
std::map<std::string, int> m_phonemeToId;
std::map<int, std::string> m_idToViseme;
float m_sampleRate;
};
struct UserCallbackData{
......@@ -105,7 +109,7 @@ private:
static void VisemeProcessing(CPRC_abuf* abuf, void * userdata); //used as callback for the CereVoice engine
static std::string to_string_with_precision(float a_value, const int n = 3);
std::map<std::string, ITASampleFrame*> m_AudioSampleFrames;
std::map<std::string, CITAAudioSample*> m_AudioSampleFrames;
std::map<std::string, std::string> m_Visemes;
IVACore* m_pAssociatedCore;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment