Load phoneme_silence from voice config

pull/172/head
Michael Hansen · 9 months ago
parent bd80cba1f3 · commit d95dab3bb3

@@ -32,14 +32,18 @@ Our goal is to support Home Assistant and the [Year of Voice](https://www.home-a
* Italian (it_IT)
* Georgian (ka_GE)
* Kazakh (kk_KZ)
* Luxembourgish (lb_LU)
* Nepali (ne_NP)
* Dutch (nl_BE, nl_NL)
* Norwegian (no_NO)
* Polish (pl_PL)
* Portuguese (pt_BR)
* Romanian (ro_RO)
* Russian (ru_RU)
* Serbian (sr_RS)
* Swedish (sv_SE)
* Swahili (sw_CD)
* Turkish (tr_TR)
* Ukrainian (uk_UA)
* Vietnamese (vi_VN)
* Chinese (zh_CN)
@@ -81,6 +85,17 @@ For multi-speaker models, use `--speaker <number>` to change speakers (default:
See `piper --help` for more options.

### Streaming Audio

Piper can stream raw audio to stdout as it's produced:
``` sh
echo 'This sentence is spoken first. This sentence is synthesized while the first sentence is spoken.' | \
./piper --model en_US-lessac-medium.onnx --output-raw | \
aplay -r 22050 -f S16_LE -t raw -
```
This is **raw** audio and not a WAV file, so make sure your audio player is set to play 16-bit mono PCM samples at the correct sample rate for the voice.
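To make the stream format concrete, here is a minimal standalone sketch (not part of Piper) that reads the raw stream from stdin and reports its duration instead of playing it, assuming 16-bit signed little-endian mono PCM at 22,050 Hz, the rate used in the `aplay` command above:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // --output-raw produces headerless PCM; the sample rate is a property of
  // the voice (22,050 Hz assumed here, matching the aplay command above).
  constexpr double sampleRate = 22050.0;

  // Each sample is one little-endian int16 read straight off stdin.
  std::vector<int16_t> samples;
  int16_t sample = 0;
  while (std::cin.read(reinterpret_cast<char *>(&sample), sizeof(sample))) {
    samples.push_back(sample);
  }

  std::cout << samples.size() << " samples, "
            << samples.size() / sampleRate << " seconds of audio\n";
  return 0;
}
```

Piping `./piper ... --output-raw` into this program in place of `aplay` prints the synthesized duration once the stream ends.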
### JSON Input

```diff
@@ -1 +1 @@
-1.1.0
+1.2.0
```

```diff
@@ -189,7 +189,21 @@ int main(int argc, char *argv[]) {
         runConfig.sentenceSilenceSeconds.value();
   }
 
-  voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;
+  if (runConfig.phonemeSilenceSeconds) {
+    if (!voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Overwrite
+      voice.synthesisConfig.phonemeSilenceSeconds =
+          runConfig.phonemeSilenceSeconds;
+    } else {
+      // Merge
+      for (const auto &[phoneme, silenceSeconds] :
+           *runConfig.phonemeSilenceSeconds) {
+        voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
+            phoneme, silenceSeconds);
+      }
+    }
+  } // if phonemeSilenceSeconds
+
   if (runConfig.outputType == OUTPUT_DIRECTORY) {
     runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
```
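The merge branch hinges on `std::map::try_emplace`, which inserts only when the key is absent: entries already loaded from the voice config keep their values, and run-time entries fill in only the phonemes the config does not cover. A self-contained sketch of that behavior (hypothetical values, not Piper code):

```cpp
#include <iostream>
#include <map>

int main() {
  // Hypothetical values: phoneme codepoint -> seconds of silence.
  std::map<char32_t, float> voiceConfig{{U'.', 0.2f}};              // from JSON
  std::map<char32_t, float> runConfig{{U'.', 0.5f}, {U',', 0.1f}};  // at run time

  // Merge as above: try_emplace never overwrites an existing key, so the
  // voice config's 0.2 s for '.' survives and only ',' is newly inserted.
  for (const auto &[phoneme, silenceSeconds] : runConfig) {
    voiceConfig.try_emplace(phoneme, silenceSeconds);
  }

  std::cout << voiceConfig.at(U'.') << " " << voiceConfig.at(U',') << "\n";
  // prints: 0.2 0.1
  return 0;
}
```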

```diff
@@ -140,7 +140,11 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
   //     "inference": {
   //         "noise_scale": 0.667,
   //         "length_scale": 1,
-  //         "noise_w": 0.8
+  //         "noise_w": 0.8,
+  //         "phoneme_silence": {
+  //             "<phoneme>": <seconds of silence>,
+  //             ...
+  //         }
   //     }
   // }
```
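For reference, the JSON keys above map onto fields of `SynthesisConfig`. A sketch of the relevant slice, with types inferred from the parsing code below; `noiseW` and the optional map are visible there, while the other field names and `Phoneme = char32_t` are assumptions:

```cpp
#include <map>
#include <optional>

// Assumption: Piper phonemes are single Unicode codepoints.
using Phoneme = char32_t;

struct SynthesisConfig {
  float noiseScale = 0.667f;  // "noise_scale"
  float lengthScale = 1.0f;   // "length_scale"
  float noiseW = 0.8f;        // "noise_w"

  // "phoneme_silence": absent unless the voice config (or the command line)
  // provides it, hence the std::optional wrapper tested in main() above.
  std::optional<std::map<Phoneme, float>> phonemeSilenceSeconds;
};
```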
```diff
@@ -166,7 +170,27 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
     if (inferenceValue.contains("noise_w")) {
       synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
     }
-  }
+
+    if (inferenceValue.contains("phoneme_silence")) {
+      // phoneme -> seconds of silence to add after
+      synthesisConfig.phonemeSilenceSeconds.emplace();
+      auto phonemeSilenceValue = inferenceValue["phoneme_silence"];
+      for (auto &phonemeItem : phonemeSilenceValue.items()) {
+        std::string phonemeStr = phonemeItem.key();
+        if (!isSingleCodepoint(phonemeStr)) {
+          spdlog::error("\"{}\" is not a single codepoint", phonemeStr);
+          throw std::runtime_error(
+              "Phonemes must be one codepoint (phoneme silence)");
+        }
+
+        auto phoneme = getCodepoint(phonemeStr);
+        (*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
+            phonemeItem.value().get<float>();
+      }
+    } // if phoneme_silence
+
+  } // if inference
 
 } /* parseSynthesisConfig */
```
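Downstream, the parsed map presumably lets the synthesizer pad the output after specific phonemes. A minimal sketch of that consumption, under the assumption (not shown in this diff) that silence is appended as zero-valued PCM samples at the voice's sample rate:

```cpp
#include <cmath>
#include <cstdint>
#include <map>
#include <optional>
#include <vector>

using Phoneme = char32_t;

// Append the configured silence (if any) for `phoneme` to `audio`.
void appendPhonemeSilence(
    Phoneme phoneme,
    const std::optional<std::map<Phoneme, float>> &phonemeSilenceSeconds,
    int sampleRate, std::vector<int16_t> &audio) {
  if (!phonemeSilenceSeconds) {
    return;  // no phoneme_silence configured anywhere
  }

  auto it = phonemeSilenceSeconds->find(phoneme);
  if (it == phonemeSilenceSeconds->end()) {
    return;  // no extra silence requested for this phoneme
  }

  // Silence in PCM is just zero samples; seconds * rate gives the count.
  auto numSamples =
      static_cast<std::size_t>(std::lround(it->second * sampleRate));
  audio.insert(audio.end(), numSamples, 0);
}

int main() {
  std::map<Phoneme, float> silence{{U'.', 0.5f}};
  std::vector<int16_t> audio(100, 1000);  // pretend: 100 synthesized samples

  appendPhonemeSilence(U'.', silence, 22050, audio);
  // 100 + 0.5 * 22050 = 11125 samples in total
  return audio.size() == 11125 ? 0 : 1;
}
```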
