Commit 8f7c2a77 authored by jwendt's avatar jwendt
Browse files

Included CereVoice Implementation for TTS

parent 24de9ee3
......@@ -84,7 +84,6 @@ if( NOT DEFINED ITA_VACORE_MAX_NUM_LISTENERS )
set( ITA_VACORE_MAX_NUM_LISTENERS 0 CACHE INT "VACore maximum number of listeners ( 0 = unlimited )" )
endif( )
if( NOT DEFINED ITA_VACORE_WITH_TTS_SIGNAL_SOURCE )
	vista_use_package( TTSRelay QUIET )
	# Default the TTS option to whether the TTSRelay package was found.
	# Fixed: previously referenced the misspelled, never-defined variable
	# ${TTSReleay}, so the option silently always defaulted to empty/false.
	# The found-flag follows the V<PACKAGE>_FOUND convention used elsewhere
	# in this file (see VCEREVOICE_FOUND below).
	set( ITA_VACORE_WITH_TTS_SIGNAL_SOURCE ${VTTSRELAY_FOUND} CACHE BOOL "VACore with text-to-speech signal source support" )
endif( )
......@@ -316,7 +315,10 @@ string( TIMESTAMP VACORE_CMAKE_DATE "%Y-%m-%d" )
add_definitions( "-DVACORE_CMAKE_DATE=\"${VACORE_CMAKE_DATE}\"" )
if( ITA_VACORE_WITH_TTS_SIGNAL_SOURCE )
	vista_use_package( CereVoice REQUIRED FIND_DEPENDENCIES )
	# Only enable the TTS code paths when CereVoice is actually available.
	# Fixed: -DVACORE_WITH_TTS_SIGNAL_SOURCE was previously added twice —
	# once unconditionally and once inside the found-guard; keep only the
	# guarded definition so the C++ code cannot be enabled without CereVoice.
	if( VCEREVOICE_FOUND )
		add_definitions( "-DVACORE_WITH_TTS_SIGNAL_SOURCE" )
	endif( )
endif( )
# Dev switches
......
......@@ -11,16 +11,60 @@
#include <sstream>
#include <assert.h>
#include <math.h>
#include <iomanip>
#include <VistaTools/VistaFileSystemDirectory.h>
#ifdef VACORE_WITH_TTS_SIGNAL_SOURCE
#include <cerevoice_eng.h>
#endif
// Static phoneme -> viseme-id and viseme-id -> viseme-name lookup tables,
// shared by all instances and filled lazily by SetupPhonemeMapping().
std::map<std::string, int> CVATextToSpeechSignalSource::m_phonemeToId;
std::map<int, std::string> CVATextToSpeechSignalSource::m_idToViseme;
/*
 * Initializes the audio stream output buffer and, if TTS support is compiled
 * in, loads all CereVoice voices found in the configured voices directory.
 *
 * @param dSampleRate   audio sampling rate of the stream
 * @param iBlockLength  audio streaming block length in samples
 */
CVATextToSpeechSignalSource::CVATextToSpeechSignalSource( const double dSampleRate, const int iBlockLength )
	: ITADatasourceRealization( 1, dSampleRate, (unsigned int)( iBlockLength ) )
	, m_pAssociatedCore( NULL )
	, m_pTTSEngine( NULL ) // Fixed: ensure a defined state; previously the pointer stayed
	                       // indeterminate when voice loading was skipped, and the
	                       // destructor passed that garbage to CPRCEN_engine_delete().
{
	m_sbOut.Init( GetBlocklength(), true );

#ifdef VACORE_WITH_TTS_SIGNAL_SOURCE
	std::string voices_path = CEREVOICE_VOICES_PATH;
	VA_INFO("CVATextToSpeechSignalSource", "CereVoices voices are searched in \"" + voices_path + "\"");

	VistaFileSystemDirectory voicesDir(voices_path);
	if (!voicesDir.Exists()){
		VA_WARN("CVATextToSpeechSignalSource", "The voices directory does not exist!");
		return;
	}

	// Load every "<name>.voice" file together with its matching "<name>.lic" licence.
	for (auto it = voicesDir.begin(); it != voicesDir.end(); ++it){
		std::string name = (*it)->GetName();
		std::size_t suffix_pos = name.find(".voice");
		if (suffix_pos == std::string::npos)
			continue;
		std::string licence_file = name.substr(0, suffix_pos) + ".lic";
		// NOTE(review): CPRCEN_engine_load() appears to create a NEW engine on
		// every iteration, so with multiple voices the earlier engines leak and
		// only the last voice remains usable; loading additional voices into one
		// engine (CPRCEN_engine_load_voice) looks intended — confirm against the
		// CereVoice SDK documentation.
		// NOTE(review): "name" is presumably an absolute path from the directory
		// iterator; if it is a bare file name, the voices_path prefix is missing.
		m_pTTSEngine = CPRCEN_engine_load(licence_file.c_str(), name.c_str());
		VA_INFO("CVATextToSpeechSignalSource", "Loaded voice \"" + name + "\"");
	}
	SetupPhonemeMapping();
#endif
}
// Releases the CereVoice engine (and all voices loaded into it).
// NOTE(review): m_pTTSEngine is not initialized to NULL in the constructor,
// so if voice loading was skipped this passes an indeterminate pointer —
// confirm CPRCEN_engine_delete's NULL handling and initialize the member.
CVATextToSpeechSignalSource::~CVATextToSpeechSignalSource()
{
#ifdef VACORE_WITH_TTS_SIGNAL_SOURCE
CPRCEN_engine_delete(m_pTTSEngine);
#endif
}
int CVATextToSpeechSignalSource::GetType() const
......@@ -35,7 +79,7 @@ std::string CVATextToSpeechSignalSource::GetTypeString() const
/**
 * @return Human-readable description of this signal source type.
 */
std::string CVATextToSpeechSignalSource::GetDesc() const
{
	// Fixed: a stale duplicate return statement ("Creates a machine that can
	// be started and stopped") preceded this one, making the updated
	// description unreachable; only the current description is returned.
	return std::string( "Creates spoken text and facial movements from a text" );
}
IVACore* CVATextToSpeechSignalSource::GetAssociatedCore() const
......@@ -46,7 +90,7 @@ IVACore* CVATextToSpeechSignalSource::GetAssociatedCore() const
const float* CVATextToSpeechSignalSource::GetStreamBlock( const CVAAudiostreamState* pStreamInfo )
{
// This is the live update function that is called by the audio streaming.
// We will deilver either zeros for a quit talker, or take speech samples from
// We will deliver either zeros for a quit talker, or take speech samples from
// the WAV file or internal sample buffer.
m_sbOut.Zero();
......@@ -73,11 +117,32 @@ std::string CVATextToSpeechSignalSource::GetStateString() const
// Resets the signal source to its initial state.
// NOTE(review): not implemented yet — currently only emits a warning and
// leaves all internal state untouched.
void CVATextToSpeechSignalSource::Reset()
{
VA_WARN("CVATextToSpeechSignalSource", "Reset is not yet implemented.");
}
CVAStruct CVATextToSpeechSignalSource::GetParameters( const CVAStruct& oArgs ) const
{
CVAStruct oRet;
if (oArgs.HasKey("list_voices") && oArgs["list_voices"]){
//list the available voices
int num_voices = CPRCEN_engine_get_voice_count(m_pTTSEngine);
oRet["number"] = num_voices;
for (int i = 0; i < num_voices; i++) {
std::string voicename = CPRCEN_engine_get_voice_info(m_pTTSEngine, i, "VOICE_NAME");
oRet["voice_" + std::to_string(i)] = voicename;
std::string language = CPRCEN_engine_get_voice_info(m_pTTSEngine, i, "LANGUAGE_CODE_ISO");
oRet["language_" + std::to_string(i)] = language;
std::string country = CPRCEN_engine_get_voice_info(m_pTTSEngine, i, "COUNTRY_CODE_ISO");
oRet["country_" + std::to_string(i)] = country;
std::string sex = CPRCEN_engine_get_voice_info(m_pTTSEngine, i, "SEX");
oRet["sex_" + std::to_string(i)] = sex;
}
return oRet;
}
oRet["ready_for_playback"] = false;
#ifndef VACORE_WITH_TTS_SIGNAL_SOURCE
oRet[ "error" ] = "TTS signal sources not activated in your VACore";
VA_WARN( "CVATextToSpeechSignalSource", "TTS signal sources was requested but is not activated in your VACore" );
......@@ -85,11 +150,48 @@ CVAStruct CVATextToSpeechSignalSource::GetParameters( const CVAStruct& oArgs ) c
if( oArgs.IsEmpty() )
{
oRet[ "info" ] = "Parameters of TTS signal source will be delivered through this struct";
oRet[ "preparation_process" ] = 0.9;
oRet[ "ready_for_playback" ] = false;
oRet[ "prepare_speech" ] = "Halleluja!";
oRet[ "prepare_text" ] = "Let me say whatever you want";
oRet[ "start_talking" ] = false;
VA_INFO("CVATextToSpeechSignalSource", "GetParameters called with empty argument, so a the argument CVAStruct was filled with example data.");
return oRet;
}
if (!oArgs.HasKey("prepare_text")){
if (oArgs.HasKey("start_talking")){
VA_WARN("CVATextToSpeechSignalSource", "For start_talking SetParameters() must be used.");
}
else{
VA_WARN("CVATextToSpeechSignalSource", "Could not interpret parameters for text-to-speech signal source getter method, use empty getter for help.");
}
return oRet;
}
if (!oArgs["prepare_text"].IsString())
VA_EXCEPT2(INVALID_PARAMETER, "Text of speech (prepare_text) has to be a string");
std::string sText = oArgs["prepare_text"];
std::string sVoice = oArgs["voice"];
// Prepare a WAV file or sample buffer with text-to-speech engine output
VA_INFO("CVATextToSpeechSignalSource", "VA received TTS command, to say \"" + sText + "\"");
std::string visemes = "<speech type=\"text/plain\">\n";
//std::string visemes = "<?xml version=\"1.0\" encoding=\"UTF - 8\"?>\n<speak>\n";
std::pair<std::string, float> visemes_lastTime = std::make_pair(visemes, 0.0f);
CPRCEN_channel_handle chan = CPRCEN_engine_open_channel(m_pTTSEngine, "", "", sVoice.c_str(), "");
CPRCEN_engine_channel_reset(m_pTTSEngine, chan);
CPRCEN_engine_clear_callback(m_pTTSEngine, chan);
CPRCEN_engine_set_callback(m_pTTSEngine, chan, (void*) &visemes_lastTime, VisemeProcessing);
CPRCEN_engine_channel_to_file(m_pTTSEngine, chan, "D:/work/tts.wav", CPRCEN_RIFF); /* File output on channel */
CPRCEN_engine_channel_speak(m_pTTSEngine, chan, sText.c_str(), sText.length(), true);
visemes = visemes_lastTime.first;
visemes += "</speech>\n";
oRet["visemes"] = visemes;
oRet["ready_for_playback"] = true;
#endif
return oRet;
......@@ -100,6 +202,8 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
#ifndef VACORE_WITH_TTS_SIGNAL_SOURCE
VA_WARN( "CVATextToSpeechSignalSource", "TTS signal sources was requested but is not activated in your VACore" );
#else
if( oParams.HasKey( "start_talking" ) )
{
// Do something, if possible
......@@ -108,23 +212,209 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
if( oParams.HasKey( "prepare_speech" ) )
{
if( !oParams[ "prepare" ].IsString() )
VA_EXCEPT2( INVALID_PARAMETER, "Text of speech has be be a string" );
std::string sText = oParams[ "prepare" ];
// Prepare a WAV file or sample buffer with text-to-speech engine output
VA_WARN("CVATextToSpeechSignalSource", "For preparing speech GetParameters() must be used.");
return;
}
VA_WARN( "CVATextToSpeechSignalSource", "Could not interpret parameters for text-to-speech signal source setter method, use empty getter for help" );
VA_WARN( "CVATextToSpeechSignalSource", "Could not interpret parameters for text-to-speech signal source setter method, use empty getter for help." );
#endif
return;
}
#ifndef VACORE_WITH_TTS_SIGNAL_SOURCE
// put dummy methods here ... not so nice, I know.
#else
// put rest here
/**
 * Converts a float to a string with a fixed number of decimal places
 * (used for viseme timestamps in the generated XML).
 *
 * @param a_value  value to convert
 * @param n        number of decimal places (defaults to 3, see declaration)
 * @return decimal string representation, e.g. 1.2345f -> "1.234" for n = 3
 *
 * Fixed: std::setprecision() alone formats with n *significant digits*, so
 * timestamps >= 10 s silently lost their millisecond resolution; std::fixed
 * makes n count decimal places as intended.
 */
std::string CVATextToSpeechSignalSource::to_string_with_precision(float a_value, const int n/* = 3*/) {
	std::ostringstream out;
	out << std::fixed << std::setprecision(n) << a_value;
	return out.str();
}
/*
 * Fills the static phoneme->viseme-id and viseme-id->viseme-name tables.
 * Covers all phonemes of the American, Scottish and German CereVoice phone
 * sets (see https://www.cereproc.com/files/CereVoicePhoneSets.pdf).
 *
 * Idempotent: the tables are only built on the first call.
 * NOTE(review): the lazy initialization is not thread-safe; fine as long as
 * all signal sources are constructed from a single thread — confirm.
 */
void CVATextToSpeechSignalSource::SetupPhonemeMapping(){
	if (!m_phonemeToId.empty())
		return; //only initialize once

	// --- phoneme label -> viseme id ---
	m_phonemeToId["sil"] = 0;
	m_phonemeToId["@"] = 6;
	m_phonemeToId["@@"] = 6;
	m_phonemeToId["a"] = 3;
	m_phonemeToId["aa"] = 3;
	m_phonemeToId["ae"] = 6;
	m_phonemeToId["aeh"] = 6;
	m_phonemeToId["ah"] = 6;
	m_phonemeToId["ai"] = 11;
	m_phonemeToId["an"] = 9;
	m_phonemeToId["ao"] = 3;
	m_phonemeToId["au"] = 3;
	m_phonemeToId["aw"] = 6;
	m_phonemeToId["ax"] = 6;
	m_phonemeToId["ay"] = 6;
	m_phonemeToId["b"] = 21;
	m_phonemeToId["ch"] = 16;
	m_phonemeToId["d"] = 19;
	m_phonemeToId["dh"] = 17;
	m_phonemeToId["dx"] = 19;
	m_phonemeToId["dzh"] = 16; //not entirely clear
	m_phonemeToId["e"] = 1;
	m_phonemeToId["e@"] = 1;
	m_phonemeToId["eh"] = 6;
	m_phonemeToId["ei"] = 4;
	m_phonemeToId["en"] = 4;
	m_phonemeToId["er"] = 5;
	m_phonemeToId["ey"] = 6;
	m_phonemeToId["f"] = 18;
	m_phonemeToId["g"] = 20;
	m_phonemeToId["h"] = 12;
	m_phonemeToId["hh"] = 6;
	m_phonemeToId["i"] = 6;
	m_phonemeToId["i@"] = 6;
	m_phonemeToId["ih"] = 6;
	m_phonemeToId["ii"] = 6;
	m_phonemeToId["iy"] = 6;
	m_phonemeToId["j"] = 6;
	m_phonemeToId["jh"] = 16;
	m_phonemeToId["k"] = 20;
	m_phonemeToId["l"] = 19;
	m_phonemeToId["m"] = 21;
	m_phonemeToId["n"] = 20;
	m_phonemeToId["ng"] = 20;
	m_phonemeToId["o"] = 8;
	m_phonemeToId["oe"] = 8; //actually no real oe viseme, so take o
	m_phonemeToId["oeh"] = 8; //actually no real oe viseme, so take o
	m_phonemeToId["oen"] = 21; //as in the German Parfum (p_a_rv_f_oen), just took m
	m_phonemeToId["oh"] = 8;
	m_phonemeToId["oi"] = 10;
	m_phonemeToId["on"] = 9;
	m_phonemeToId["oo"] = 8;
	m_phonemeToId["ou"] = 8;
	m_phonemeToId["ow"] = 8;
	m_phonemeToId["oy"] = 8;
	m_phonemeToId["p"] = 21;
	m_phonemeToId["pf"] = 21;
	m_phonemeToId["q"] = 1; //this is somehow only used in German, before starting as, e.g. in Abend (q_ah_b_@_n_t)
	m_phonemeToId["r"] = 13;
	m_phonemeToId["rv"] = 13;
	m_phonemeToId["rl"] = 5;
	m_phonemeToId["s"] = 15;
	m_phonemeToId["sh"] = 16;
	m_phonemeToId["T"] = 19;
	m_phonemeToId["t"] = 19;
	m_phonemeToId["th"] = 17;
	m_phonemeToId["ts"] = 15;
	m_phonemeToId["tsh"] = 16;
	m_phonemeToId["u"] = 8;
	m_phonemeToId["u@"] = 8;
	m_phonemeToId["uh"] = 8;
	m_phonemeToId["ue"] = 6; //no real ue
	m_phonemeToId["ueh"] = 6; //no real ue
	m_phonemeToId["uu"] = 8;
	m_phonemeToId["uw"] = 8;
	m_phonemeToId["v"] = 18;
	m_phonemeToId["w"] = 7;
	m_phonemeToId["x"] = 20; //actually not really supported, as in Scottish loch (l_o_x)
	m_phonemeToId["y"] = 7;
	m_phonemeToId["z"] = 15;
	m_phonemeToId["zh"] = 16;
	m_phonemeToId["R"] = 13;

	// --- viseme id -> viseme name (as expected by the facial animation) ---
	m_idToViseme[0] = "_"; /// silence
	m_idToViseme[1] = "Ah"; /// Viseme for aa, ae, ah
	m_idToViseme[2] = "Aa"; /// Viseme for aa
	m_idToViseme[3] = "Ao"; /// ao
	m_idToViseme[4] = "Eh"; /// ey, eh, uh
	m_idToViseme[5] = "Er"; /// er
	m_idToViseme[6] = "Ih"; /// y, iy, ih, ix
	m_idToViseme[7] = "W"; /// w, uw
	m_idToViseme[8] = "Ow"; /// ow
	m_idToViseme[9] = "Aw"; /// aw
	m_idToViseme[10] = "Oy"; /// oy
	m_idToViseme[11] = "Ay"; /// ay
	m_idToViseme[12] = "H"; /// h
	m_idToViseme[13] = "R"; /// r
	m_idToViseme[14] = "L"; /// l
	m_idToViseme[15] = "Z"; /// s, z
	m_idToViseme[16] = "Sh"; /// sh, ch, jh, zh
	m_idToViseme[17] = "Th"; /// th, dh
	m_idToViseme[18] = "F"; /// f, v
	m_idToViseme[19] = "D"; /// d, t, n - also try NG: 2 to 1 against
	m_idToViseme[20] = "KG"; /// k, g, ,ng - also try NG: 2 to 1 against
	m_idToViseme[21] = "BMP"; /// p, b, m
}
/**
 * Maps a CereVoice phoneme label to the viseme name used for facial
 * animation. Falls back to returning the phoneme itself (with a warning)
 * when no mapping exists.
 *
 * @param phoneme  phoneme label as delivered by the CereVoice transcription
 * @return viseme name, or the unchanged phoneme if unmapped
 */
std::string CVATextToSpeechSignalSource::PhonemeToViseme(std::string phoneme) {
	auto it = m_phonemeToId.find(phoneme);
	if (it == m_phonemeToId.end()){
		VA_WARN("CVATextToSpeechSignalSource", "There exists no mapping for the phoneme: \"" + phoneme + "\"");
		return phoneme;
	}
	auto it2 = m_idToViseme.find(it->second);
	if (it2 == m_idToViseme.end()){
		// Fixed: the previous '"...id: " + it->second' added an int to a string
		// literal (pointer arithmetic past the literal, undefined behavior) and
		// logged garbage; the id must be converted to text explicitly.
		VA_WARN("CVATextToSpeechSignalSource", "There exists no mapping for viseme id: " + std::to_string(it->second));
		return phoneme;
	}
	return it2->second;
}
/*
 * CereVoice synthesis callback, invoked once per synthesized sentence.
 * Extracts phoneme timing information from the audio buffer transcription
 * and appends one <lips .../> XML element per phoneme to the string in
 * userdata.
 *
 * userdata is a std::pair<std::string, float>*:
 *   first  - accumulated viseme XML across sentences
 *   second - time offset added to all timestamps of the current sentence
 */
void CVATextToSpeechSignalSource::VisemeProcessing(CPRC_abuf * abuf, void * userdata) {
#ifdef VACORE_WITH_TTS_SIGNAL_SOURCE
//this callback is called per sentence in the text, so we need to append to the other visemes and also time-wise!
std::pair<std::string, float>* visemes_time = (std::pair<std::string, float>*) userdata;
float endTime = 0.0f;
if (abuf == NULL){
VA_WARN("CVATextToSpeechSignalSource", "The buffer is NULL, cannot extract visemes!");
return;
}
if (userdata == NULL){
VA_WARN("CVATextToSpeechSignalSource", "The userdata viseme string is NULL, cannot extract visemes!");
return;
}
/* Transcriptions contain markers, phonetic information, a
list of these items is available for each audio buffer. */
const CPRC_abuf_trans * trans;
std::string label;
float start, end;
/* Process the transcription buffer items and print information. */
for (int i = 0; i < CPRC_abuf_trans_sz(abuf); i++) {
trans = CPRC_abuf_get_trans(abuf, i);
start = CPRC_abuf_trans_start(trans); /* Start time in seconds */
end = CPRC_abuf_trans_end(trans); /* End time in seconds */
label = CPRC_abuf_trans_name(trans); /* Label, type dependent */
if (CPRC_abuf_trans_type(trans) == CPRC_ABUF_TRANS_PHONE) {
//VA_INFO("CVATextToSpeechSignalSource", "Phoneme: " + std::to_string(start) + " " + std::to_string(end) + " " + label);
//visemes_time->first.append( "\t<viseme start=\"" + to_string_with_precision(start+visemes_time->second) + "\" articulation=\"1\" type=\"" + PhonemeToViseme(label) + "\" />\n");
// Build one <lips/> element; all times are shifted by the sentence offset.
std::string viseme = std::string("\t<lips ")
+ "viseme=\"" + PhonemeToViseme(label) + "\" "
+ "articulation=\"1.0\" "
+ "start=\"" + to_string_with_precision(start + visemes_time->second) + "\" "
+ "ready=\"" + to_string_with_precision(start + visemes_time->second) + "\" "
+ "relax=\"" + to_string_with_precision(end + visemes_time->second) + "\" "
+ "end=\"" + to_string_with_precision(end + visemes_time->second) + "\" "
+ "/>\n";
visemes_time->first.append(viseme);
endTime = end;
}
/*else if (CPRC_abuf_trans_type(trans) == CPRC_ABUF_TRANS_WORD) {
VA_INFO("CVATextToSpeechSignalSource", "Word: " + std::to_string(start) + " " + std::to_string(end) + " " + label);
}
else if (CPRC_abuf_trans_type(trans) == CPRC_ABUF_TRANS_MARK) {
VA_INFO("CVATextToSpeechSignalSource", "Marker: " + std::to_string(start) + " " + std::to_string(end) + " " + label);
}*/
else if (CPRC_abuf_trans_type(trans) == CPRC_ABUF_TRANS_ERROR) {
VA_INFO("CVATextToSpeechSignalSource", "ERROR: could not retrieve transcription at "+ std::to_string(i));
}
}
// NOTE(review): this OVERWRITES the previous offset instead of accumulating
// (visemes_time->second += endTime); that is only correct if transcription
// times of later sentences are already absolute for the whole channel —
// confirm against the CereVoice SDK, otherwise multi-sentence texts get
// wrong timestamps. Also note the offset is reset to 0 for a sentence
// containing no phonemes.
visemes_time->second = endTime;
#endif
}
......@@ -9,6 +9,8 @@
#include <ITAAtomicPrimitives.h>
class CVACoreImpl;
class CPRCEN_engine;
class CPRC_abuf;
/** Text-to-speech signal source
*
......@@ -29,13 +31,53 @@ public:
void HandleRegistration( IVACore* );
void HandleUnregistration( IVACore* );
std::string GetStateString() const;
/*
* This should be used to start a prepared TTS using a CVAStruct with:
* ["play_speech"] = identificator of created speech
*/
void SetParameters( const CVAStruct& );
/*
* This can be used to prepare a TTS using a CVAStruct with:
* ["prepare_text"] = text to be spoken
* ["start_talking"] = true/false //whether it should be directly replayed
* ["voice"] = the voice to be used //if none is given or the one given cannot be found the standard voice is used (i.e. "Heather")
* and returns a CVAStruct with:
* ["ready_for_playback"] = true, if no error occurred, false otherwise
* ["id"] = identificator that will be used for replying //see above
* ["visemes"] = viseme data for facial animation as xml string
*
* This can also be used to find the available voices using a CVAStruct with:
* ["list_voices"] = true
* and returns a CVAStruct with:
* ["number"] = the number of available voices
* ["voice_i"] = the name of the voice, where i is 0...["number"]-1
* ["sex_i"] = the sex of the voice, i.e. "male" or "female"; where i is 0...["number"]-1
* ["language_i"] = the language of the voice, e.g. "en" or "de"; where i is 0...["number"]-1
* ["country_i"] = the country of the voice, e.g. "GB" or "US"; where i is 0...["number"]-1
*/
CVAStruct GetParameters( const CVAStruct& ) const;
void Reset();
private:
static void VisemeProcessing(CPRC_abuf* abuf, void * userdata); //used as callback for the CereVoice engine
static std::string to_string_with_precision(float a_value, const int n = 3);
static std::string PhonemeToViseme(std::string phoneme);
void SetupPhonemeMapping();
IVACore* m_pAssociatedCore;
ITASampleBuffer m_sbOut;
/*The engine maintains the list of
loaded voices and makes them available to synthesis channels. */
CPRCEN_engine* m_pTTSEngine;
static std::map<std::string, int> m_phonemeToId;
static std::map<int, std::string> m_idToViseme;
};
#endif // IW_VA_TEXT_TO_SPEECH_SIGNAL_SOURCE
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment