diff --git a/README.md b/README.md
index f144049..9f88603 100644
--- a/README.md
+++ b/README.md
@@ -32,14 +32,18 @@ Our goal is to support Home Assistant and the [Year of Voice](https://www.home-a
 * Italian (it_IT)
 * Georgian (ka_GE)
 * Kazakh (kk_KZ)
+* Luxembourgish (lb_LU)
 * Nepali (ne_NP)
 * Dutch (nl_BE, nl_NL)
 * Norwegian (no_NO)
 * Polish (pl_PL)
 * Portuguese (pt_BR)
+* Romanian (ro_RO)
 * Russian (ru_RU)
+* Serbian (sr_RS)
 * Swedish (sv_SE)
 * Swahili (sw_CD)
+* Turkish (tr_TR)
 * Ukrainian (uk_UA)
 * Vietnamese (vi_VN)
 * Chinese (zh_CN)
@@ -81,6 +85,17 @@ For multi-speaker models, use `--speaker <number>` to change speakers (default:
 
 See `piper --help` for more options.
 
+### Streaming Audio
+
+Piper can stream raw audio to stdout as it's produced:
+
+``` sh
+echo 'This sentence is spoken first. This sentence is synthesized while the first sentence is spoken.' | \
+  ./piper --model en_US-lessac-medium.onnx --output-raw | \
+  aplay -r 22050 -f S16_LE -t raw -
+```
+
+This is **raw** audio and not a WAV file, so make sure your audio player is set to play 16-bit mono PCM samples at the correct sample rate for the voice.
 
 ### JSON Input
 
diff --git a/VERSION b/VERSION
index 9084fa2..26aaba0 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.1.0
+1.2.0
diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index aad42af..8972eac 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -189,7 +189,21 @@ int main(int argc, char *argv[]) {
         runConfig.sentenceSilenceSeconds.value();
   }
 
-  voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;
+  if (runConfig.phonemeSilenceSeconds) {
+    if (!voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Overwrite
+      voice.synthesisConfig.phonemeSilenceSeconds =
+          runConfig.phonemeSilenceSeconds;
+    } else {
+      // Merge
+      for (const auto &[phoneme, silenceSeconds] :
+           *runConfig.phonemeSilenceSeconds) {
+        voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
+            phoneme, silenceSeconds);
+      }
+    }
+
+  } // if phonemeSilenceSeconds
 
   if (runConfig.outputType == OUTPUT_DIRECTORY) {
     runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp
index 6da95cd..ef7eb49 100644
--- a/src/cpp/piper.cpp
+++ b/src/cpp/piper.cpp
@@ -140,7 +140,11 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
   //   "inference": {
   //     "noise_scale": 0.667,
   //     "length_scale": 1,
-  //     "noise_w": 0.8
+  //     "noise_w": 0.8,
+  //     "phoneme_silence": {
+  //       "<phoneme>": <seconds of silence>,
+  //       ...
+  //     }
   //   }
   // }
 
@@ -166,7 +170,27 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
     if (inferenceValue.contains("noise_w")) {
       synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
     }
-  }
+
+    if (inferenceValue.contains("phoneme_silence")) {
+      // phoneme -> seconds of silence to add after
+      synthesisConfig.phonemeSilenceSeconds.emplace();
+      auto phonemeSilenceValue = inferenceValue["phoneme_silence"];
+      for (auto &phonemeItem : phonemeSilenceValue.items()) {
+        std::string phonemeStr = phonemeItem.key();
+        if (!isSingleCodepoint(phonemeStr)) {
+          spdlog::error("\"{}\" is not a single codepoint", phonemeStr);
+          throw std::runtime_error(
+              "Phonemes must be one codepoint (phoneme silence)");
+        }
+
+        auto phoneme = getCodepoint(phonemeStr);
+        (*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
+            phonemeItem.value().get<float>();
+      }
+
+    } // if phoneme_silence
+
+  } // if inference
 } /* parseSynthesisConfig */
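
For illustration, a voice config's `inference` section using the new `phoneme_silence` key might look like the sketch below, following the schema documented in the `parseSynthesisConfig` comment above. The phoneme keys (`.`, `,`) and second values here are hypothetical examples, not values from a shipped voice.

``` json
{
  "inference": {
    "noise_scale": 0.667,
    "length_scale": 1,
    "noise_w": 0.8,
    "phoneme_silence": {
      ".": 0.75,
      ",": 0.3
    }
  }
}
```

Each key must be a single codepoint (`parseSynthesisConfig` rejects longer keys), and each value is read as a float number of seconds of silence to add after that phoneme. Because `main.cpp` merges with `try_emplace`, entries already present in the voice config are not overwritten by values supplied at run time.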