Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Institute of Technical Acoustics (ITA)
VACore
Commits
c99f4e0d
Commit
c99f4e0d
authored
Jun 19, 2017
by
jwendt
Browse files
TTS implemented using correct resampling from 48kHz to 44.1kHz
parent
40065413
Changes
3
Hide whitespace changes
Inline
Side-by-side
CMakeLists.txt
View file @
c99f4e0d
...
...
@@ -316,6 +316,7 @@ add_definitions( "-DVACORE_CMAKE_DATE=\"${VACORE_CMAKE_DATE}\"" )
if
(
ITA_VACORE_WITH_TTS_SIGNAL_SOURCE
)
vista_use_package
(
CereVoice REQUIRED FIND_DEPENDENCIES
)
vista_use_package
(
samplerate REQUIRED FIND_DEPENDENCIES
)
if
(
VCEREVOICE_FOUND
)
add_definitions
(
"-DVACORE_WITH_TTS_SIGNAL_SOURCE"
)
endif
(
)
...
...
src/Audiosignals/VATextToSpeechSignalSource.cpp
View file @
c99f4e0d
...
...
@@ -19,10 +19,13 @@
#include
<VistaTools/VistaFileSystemDirectory.h>
#ifdef VACORE_WITH_TTS_SIGNAL_SOURCE
#include
<cerevoice_eng.h>
#endif
//#define TTS_USE_WAV
CVATextToSpeechSignalSource
::
CVATextToSpeechSignalSource
(
const
double
dSampleRate
,
const
int
iBlockLength
)
:
ITADatasourceRealization
(
1
,
dSampleRate
,
(
unsigned
int
)(
iBlockLength
))
...
...
@@ -94,7 +97,6 @@ const float* CVATextToSpeechSignalSource::GetStreamBlock( const CVAAudiostreamSt
// return pAudioFile->GetStreamBlock(pStreamInfo);
}
// @todo: take samples from generated WAV file
return
m_sbOut
.
data
();
}
...
...
@@ -209,14 +211,14 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
if
(
oParams
.
HasKey
(
"voice"
)
&&
oParams
[
"voice"
].
IsString
())
sVoice
=
oParams
[
"voice"
];
bool
direct_
re
play
=
false
;
if
(
oParams
.
HasKey
(
"direct_
re
play"
)
&&
oParams
[
"direct_
re
play"
].
IsBool
()
&&
oParams
[
"direct_
re
play"
]){
direct_
re
play
=
true
;
bool
direct_play
back
=
false
;
if
(
oParams
.
HasKey
(
"direct_play
back
"
)
&&
oParams
[
"direct_play
back
"
].
IsBool
()
&&
oParams
[
"direct_play
back
"
]){
direct_play
back
=
true
;
}
std
::
string
id
=
"tmp"
;
if
(
!
oParams
.
HasKey
(
"id"
)
||
!
oParams
[
"id"
].
IsString
()){
if
(
!
direct_
re
play
){
if
(
!
direct_play
back
){
VA_WARN
(
"CVATextToSpeechSignalSource"
,
"No id is given for the prepare speech request, the user application has to give an unique id."
);
return
;
}
...
...
@@ -242,23 +244,35 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
CPRCEN_engine_channel_reset
(
TTSEngine
::
getInstance
().
getEngine
(),
chan
);
CPRCEN_engine_clear_callback
(
TTSEngine
::
getInstance
().
getEngine
(),
chan
);
CPRCEN_engine_set_callback
(
TTSEngine
::
getInstance
().
getEngine
(),
chan
,
(
void
*
)
&
data
,
VisemeProcessing
);
//CPRCEN_engine_channel_to_file(TTSEngine::getInstance().getEngine(), chan, "D:/work/tts.wav", CPRCEN_RIFF); /* File output on channel */
#ifdef TTS_USE_WAV
CPRCEN_engine_channel_to_file
(
TTSEngine
::
getInstance
().
getEngine
(),
chan
,
"D:/work/tts.wav"
,
CPRCEN_RIFF
);
/* File output on channel */
#endif
CPRC_abuf
*
buf
=
CPRCEN_engine_channel_speak
(
TTSEngine
::
getInstance
().
getEngine
(),
chan
,
sText
.
c_str
(),
sText
.
length
(),
true
);
int
sRate
=
CPRC_abuf_wav_srate
(
buf
);
std
::
cout
<<
"CereVoice rate is: "
<<
sRate
<<
" while sound signal rate is: "
<<
GetSampleRate
()
<<
std
::
endl
;
data
.
visemes
+=
"</speech>
\n
"
;
data
.
visemes
+=
"<event id=
\"
"
+
id
+
"
\"
start=
\"
0
\"
message=
\"
speech started
\"
/>"
;
//
data.visemes += "<event id=\"" + id + "\" start=\"0\" message=\"speech started\" />";
m_Visemes
[
id
]
=
data
.
visemes
;
#ifdef TTS_USE_WAV
ITASampleFrame
*
pAudioBuffer
=
new
ITASampleFrame
(
"D:/work/tts.wav"
);
CITAAudioSample
*
pAudioSample
=
new
CITAAudioSample
();
//pAudioSample->Load(*pAudioBuffer, 44100.0f);
pAudioSample
->
Load
(
*
pAudioBuffer
,
TTSEngine
::
getInstance
().
getSampleRate
());
m_AudioSampleFrames
[
id
]
=
pAudioSample
;
#else
ITASampleFrame
*
pAudioBuffer
=
new
ITASampleFrame
();
pAudioBuffer
->
init
(
1
,
data
.
floatBuffer
.
size
(),
false
);
(
*
pAudioBuffer
)[
0
].
write
(
&
data
.
floatBuffer
[
0
],
data
.
floatBuffer
.
size
());
m_AudioSampleFrames
[
id
]
=
pAudioBuffer
;
if
(
direct_replay
){
CITAAudioSample
*
pAudioSample
=
new
CITAAudioSample
();
pAudioSample
->
Load
(
*
pAudioBuffer
,
TTSEngine
::
getInstance
().
getSampleRate
());
m_AudioSampleFrames
[
id
]
=
pAudioSample
;
#endif
if
(
direct_playback
){
CVAStruct
oParams_play
;
oParams_play
[
"play_speech"
]
=
id
;
oParams_play
[
"free_after"
]
=
true
;
...
...
@@ -305,6 +319,8 @@ void CVATextToSpeechSignalSource::SetParameters( const CVAStruct& oParams )
ITASampleFrame
*
pAudioBuffer
=
it
->
second
;
VA_INFO
(
"CVATextToSpeechSignalSource"
,
"GetSAmpleRAte: "
+
std
::
to_string
(
GetSampleRate
())
+
" engine::rate "
+
std
::
to_string
(
TTSEngine
::
getInstance
().
getSampleRate
()));
if
(
m_pBufferDataSource
!=
NULL
)
delete
m_pBufferDataSource
;
m_pBufferDataSource
=
new
ITABufferDatasource
((
*
pAudioBuffer
)[
0
].
data
(),
pAudioBuffer
->
length
(),
GetSampleRate
(),
GetBlocklength
());
...
...
@@ -426,6 +442,18 @@ CVATextToSpeechSignalSource::TTSEngine::TTSEngine(){
VA_INFO
(
"CVATextToSpeechSignalSource"
,
"Loaded voice
\"
"
+
name
+
"
\"
"
);
}
// Now determine which sample rate the loaded voices use (and warn if they differ)
m_sampleRate
=
-
1.0
;
int
num_voices
=
CPRCEN_engine_get_voice_count
(
getEngine
());
for
(
int
i
=
0
;
i
<
num_voices
;
i
++
)
{
std
::
string
strSamplerate
=
CPRCEN_engine_get_voice_info
(
getEngine
(),
i
,
"SAMPLE_RATE"
);
float
rate
=
std
::
stof
(
strSamplerate
);
if
(
m_sampleRate
<
0.0
)
m_sampleRate
=
rate
;
if
(
rate
!=
m_sampleRate
)
VA_WARN
(
"CVATextToSpeechSignalSource"
,
"Voices with different sample rates are used namely "
+
std
::
to_string
(
rate
)
+
" and "
+
std
::
to_string
(
m_sampleRate
));
}
SetupPhonemeMapping
();
//std::string licence_file = voices_path + "cerevoice_heather_4.0.0_48k.lic";
...
...
@@ -447,6 +475,10 @@ CPRCEN_engine* CVATextToSpeechSignalSource::TTSEngine::getEngine() const
return
m_pTTSEngine
;
}
/**
 * Returns the sample rate cached in m_sampleRate.
 *
 * In the visible constructor code, m_sampleRate starts at -1.0 and is set to the
 * "SAMPLE_RATE" voice info of the first voice reported by the engine, so a negative
 * value means no voice rate was recorded.
 * NOTE(review): the unit is presumably Hz -- confirm against the CereVoice voice info.
 */
float CVATextToSpeechSignalSource::TTSEngine::getSampleRate() const
{
	const float fCachedRate = m_sampleRate;
	return fCachedRate;
}
std
::
string
CVATextToSpeechSignalSource
::
TTSEngine
::
PhonemeToViseme
(
std
::
string
phoneme
)
{
auto
it
=
m_phonemeToId
.
find
(
phoneme
);
...
...
src/Audiosignals/VATextToSpeechSignalSource.h
View file @
c99f4e0d
...
...
@@ -8,6 +8,7 @@
#include
<ITADataSourceRealization.h>
#include
<ITASampleBuffer.h>
#include
<ITAAudioSample.h>
#include
<ITAAtomicPrimitives.h>
class
ITABufferDatasource
;
...
...
@@ -15,6 +16,7 @@ class CVACoreImpl;
class
CPRCEN_engine
;
class
CPRC_abuf
;
/** Text-to-speech signal source
*
* The TTS signal source generates sound from text using external libraries, like TTSRelay for Windows platforms.
...
...
@@ -41,7 +43,7 @@ public:
* ["prepare_text"] = text to be spoken
* ["id"] = identifier that will be used to play and reference this speech (must be unique)
* ["voice"] = the voice to be used //if none is given or the one given cannot be found the standard voice is used (i.e. "Heather")
* ["direct_
re
play"] = true/false whether the audio should directly be played (in this case no id has to be given, should not be used for lipsyncing)
* ["direct_play
back
"] = true/false whether the audio should directly be played (in this case no id has to be given, should not be used for lipsyncing)
* This should be used to start a prepared TTS using a CVAStruct with:
* ["play_speech"] = identifier (int) of created speech
...
...
@@ -86,6 +88,7 @@ private:
void
operator
=
(
TTSEngine
const
&
)
=
delete
;
CPRCEN_engine
*
getEngine
()
const
;
float
getSampleRate
()
const
;
std
::
string
PhonemeToViseme
(
std
::
string
phoneme
);
private:
...
...
@@ -94,6 +97,7 @@ private:
CPRCEN_engine
*
m_pTTSEngine
;
//you must not delete this from outside!!!!!!
std
::
map
<
std
::
string
,
int
>
m_phonemeToId
;
std
::
map
<
int
,
std
::
string
>
m_idToViseme
;
float
m_sampleRate
;
};
struct
UserCallbackData
{
...
...
@@ -105,7 +109,7 @@ private:
static
void
VisemeProcessing
(
CPRC_abuf
*
abuf
,
void
*
userdata
);
//used as callback for the CereVoice engine
static
std
::
string
to_string_with_precision
(
float
a_value
,
const
int
n
=
3
);
std
::
map
<
std
::
string
,
ITASampl
eFram
e
*>
m_AudioSampleFrames
;
std
::
map
<
std
::
string
,
C
ITA
Audio
Sample
*>
m_AudioSampleFrames
;
std
::
map
<
std
::
string
,
std
::
string
>
m_Visemes
;
IVACore
*
m_pAssociatedCore
;
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment