mirror of https://github.com/rhasspy/piper
First working version with libpiper_phonemize
parent
7d27863b48
commit
810fad44cf
Binary file not shown.
@ -1,155 +0,0 @@
|
|||||||
#ifndef CONFIG_H_
|
|
||||||
#define CONFIG_H_
|
|
||||||
|
|
||||||
#include <filesystem>
|
|
||||||
#include <map>
|
|
||||||
#include <optional>
|
|
||||||
#include <set>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
#include "utf8.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using json = nlohmann::json;
|
|
||||||
|
|
||||||
namespace piper {
|
|
||||||
|
|
||||||
typedef char32_t Phoneme;
|
|
||||||
typedef int64_t PhonemeId;
|
|
||||||
typedef int64_t SpeakerId;
|
|
||||||
|
|
||||||
const string DefaultVoice = "en-us";
|
|
||||||
|
|
||||||
enum eSpeakMode { Text, TextWithPhonemes, SSML };
|
|
||||||
|
|
||||||
struct eSpeakConfig {
|
|
||||||
string voice = DefaultVoice;
|
|
||||||
eSpeakMode mode = Text;
|
|
||||||
|
|
||||||
// Characters that eSpeak uses to break apart paragraphs/sentences
|
|
||||||
set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
|
|
||||||
|
|
||||||
Phoneme fullStop = U'.';
|
|
||||||
Phoneme comma = U',';
|
|
||||||
Phoneme question = U'?';
|
|
||||||
Phoneme exclamation = U'!';
|
|
||||||
};
|
|
||||||
|
|
||||||
struct PhonemizeConfig {
|
|
||||||
optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
|
|
||||||
map<Phoneme, vector<PhonemeId>> phonemeIdMap;
|
|
||||||
|
|
||||||
PhonemeId idPad = 0; // padding (optionally interspersed)
|
|
||||||
PhonemeId idBos = 1; // beginning of sentence
|
|
||||||
PhonemeId idEos = 2; // end of sentence
|
|
||||||
bool interspersePad = true;
|
|
||||||
|
|
||||||
optional<eSpeakConfig> eSpeak;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct SynthesisConfig {
|
|
||||||
float noiseScale = 0.667f;
|
|
||||||
float lengthScale = 1.0f;
|
|
||||||
float noiseW = 0.8f;
|
|
||||||
int sampleRate = 22050;
|
|
||||||
int sampleWidth = 2; // 16-bit
|
|
||||||
int channels = 1; // mono
|
|
||||||
optional<SpeakerId> speakerId;
|
|
||||||
float sentenceSilenceSeconds = 0.2f;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ModelConfig {
|
|
||||||
int numSpeakers;
|
|
||||||
};
|
|
||||||
|
|
||||||
bool isSingleCodepoint(string s) {
|
|
||||||
return utf8::distance(s.begin(), s.end()) == 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
Phoneme getCodepoint(string s) {
|
|
||||||
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
|
|
||||||
return *character_iter;
|
|
||||||
}
|
|
||||||
|
|
||||||
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
|
||||||
|
|
||||||
if (configRoot.contains("espeak")) {
|
|
||||||
if (!phonemizeConfig.eSpeak) {
|
|
||||||
phonemizeConfig.eSpeak.emplace();
|
|
||||||
}
|
|
||||||
|
|
||||||
auto espeakValue = configRoot["espeak"];
|
|
||||||
if (espeakValue.contains("voice")) {
|
|
||||||
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<string>();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// phoneme to [phoneme] map
|
|
||||||
if (configRoot.contains("phoneme_map")) {
|
|
||||||
if (!phonemizeConfig.phonemeMap) {
|
|
||||||
phonemizeConfig.phonemeMap.emplace();
|
|
||||||
}
|
|
||||||
|
|
||||||
auto phonemeMapValue = configRoot["phoneme_map"];
|
|
||||||
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
|
|
||||||
string fromPhoneme = fromPhonemeItem.key();
|
|
||||||
if (!isSingleCodepoint(fromPhoneme)) {
|
|
||||||
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
|
||||||
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
|
|
||||||
string toPhoneme = toPhonemeValue.get<string>();
|
|
||||||
if (!isSingleCodepoint(toPhoneme)) {
|
|
||||||
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto toCodepoint = getCodepoint(toPhoneme);
|
|
||||||
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// phoneme to [id] map
|
|
||||||
if (configRoot.contains("phoneme_id_map")) {
|
|
||||||
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
|
|
||||||
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
|
|
||||||
string fromPhoneme = fromPhonemeItem.key();
|
|
||||||
if (!isSingleCodepoint(fromPhoneme)) {
|
|
||||||
throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
|
||||||
for (auto &toIdValue : fromPhonemeItem.value()) {
|
|
||||||
PhonemeId toId = toIdValue.get<PhonemeId>();
|
|
||||||
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} /* parsePhonemizeConfig */
|
|
||||||
|
|
||||||
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
|
||||||
|
|
||||||
if (configRoot.contains("audio")) {
|
|
||||||
auto audioValue = configRoot["audio"];
|
|
||||||
if (audioValue.contains("sample_rate")) {
|
|
||||||
// Default sample rate is 22050 Hz
|
|
||||||
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} /* parseSynthesisConfig */
|
|
||||||
|
|
||||||
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
|
|
||||||
|
|
||||||
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
|
|
||||||
|
|
||||||
} /* parseModelConfig */
|
|
||||||
|
|
||||||
} // namespace piper
|
|
||||||
|
|
||||||
#endif // CONFIG_H_
|
|
@ -1,53 +0,0 @@
|
|||||||
#ifndef MODEL_H_
|
|
||||||
#define MODEL_H_
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include <onnxruntime_cxx_api.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace piper {
|
|
||||||
const string instanceName{"piper"};
|
|
||||||
|
|
||||||
struct ModelSession {
|
|
||||||
Ort::Session onnx;
|
|
||||||
Ort::AllocatorWithDefaultOptions allocator;
|
|
||||||
Ort::SessionOptions options;
|
|
||||||
Ort::Env env;
|
|
||||||
|
|
||||||
ModelSession() : onnx(nullptr){};
|
|
||||||
};
|
|
||||||
|
|
||||||
void loadModel(string modelPath, ModelSession &session) {
|
|
||||||
|
|
||||||
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
|
|
||||||
instanceName.c_str());
|
|
||||||
session.env.DisableTelemetryEvents();
|
|
||||||
|
|
||||||
// Slows down performance by ~2x
|
|
||||||
// session.options.SetIntraOpNumThreads(1);
|
|
||||||
|
|
||||||
// Roughly doubles load time for no visible inference benefit
|
|
||||||
// session.options.SetGraphOptimizationLevel(
|
|
||||||
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
|
|
||||||
|
|
||||||
session.options.SetGraphOptimizationLevel(
|
|
||||||
GraphOptimizationLevel::ORT_DISABLE_ALL);
|
|
||||||
|
|
||||||
// Slows down performance very slightly
|
|
||||||
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
|
|
||||||
|
|
||||||
session.options.DisableCpuMemArena();
|
|
||||||
session.options.DisableMemPattern();
|
|
||||||
session.options.DisableProfiling();
|
|
||||||
|
|
||||||
auto startTime = chrono::steady_clock::now();
|
|
||||||
session.onnx = Ort::Session(session.env, filesystem::path(modelPath).c_str(), session.options);
|
|
||||||
auto endTime = chrono::steady_clock::now();
|
|
||||||
auto loadDuration = chrono::duration<double>(endTime - startTime);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace piper
|
|
||||||
|
|
||||||
#endif // MODEL_H_
|
|
@ -1,142 +0,0 @@
|
|||||||
#ifndef PHONEMIZE_H_
|
|
||||||
#define PHONEMIZE_H_
|
|
||||||
|
|
||||||
#include <filesystem>
|
|
||||||
#include <iostream>
|
|
||||||
#include <map>
|
|
||||||
#include <optional>
|
|
||||||
#include <set>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include <espeak-ng/speak_lib.h>
|
|
||||||
|
|
||||||
#include "config.hpp"
|
|
||||||
#include "utf8.h"
|
|
||||||
|
|
||||||
#define CLAUSE_INTONATION_FULL_STOP 0x00000000
|
|
||||||
#define CLAUSE_INTONATION_COMMA 0x00001000
|
|
||||||
#define CLAUSE_INTONATION_QUESTION 0x00002000
|
|
||||||
#define CLAUSE_INTONATION_EXCLAMATION 0x00003000
|
|
||||||
|
|
||||||
#define CLAUSE_TYPE_SENTENCE 0x00080000
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace piper {
|
|
||||||
|
|
||||||
// Text to phonemes using eSpeak-ng
|
|
||||||
void phonemize(string text, PhonemizeConfig &phonemizeConfig,
|
|
||||||
vector<vector<Phoneme>> &phonemes) {
|
|
||||||
if (!phonemizeConfig.eSpeak) {
|
|
||||||
throw runtime_error("Missing eSpeak config");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto voice = phonemizeConfig.eSpeak->voice;
|
|
||||||
int result = espeak_SetVoiceByName(voice.c_str());
|
|
||||||
if (result != 0) {
|
|
||||||
throw runtime_error("Failed to set eSpeak-ng voice");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Modified by eSpeak
|
|
||||||
string textCopy(text);
|
|
||||||
|
|
||||||
utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
|
|
||||||
utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
|
|
||||||
vector<char32_t> textClauseBreakers;
|
|
||||||
|
|
||||||
// Identify clause breakers in the sentence, since eSpeak removes them during
|
|
||||||
// phonemization.
|
|
||||||
//
|
|
||||||
// This will unfortunately do the wrong thing with abbreviations, etc.
|
|
||||||
while (textIter != textIterEnd) {
|
|
||||||
auto codepoint = *textIter;
|
|
||||||
if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
|
|
||||||
textClauseBreakers.push_back(codepoint);
|
|
||||||
}
|
|
||||||
|
|
||||||
textIter++;
|
|
||||||
}
|
|
||||||
|
|
||||||
vector<Phoneme> *sentencePhonemes = nullptr;
|
|
||||||
const char *inputTextPointer = textCopy.c_str();
|
|
||||||
int terminator = 0;
|
|
||||||
|
|
||||||
while (inputTextPointer != NULL) {
|
|
||||||
// Modified espeak-ng API to get access to clause terminator
|
|
||||||
string clausePhonemes(
|
|
||||||
espeak_TextToPhonemes2((const void **)&inputTextPointer,
|
|
||||||
/*textmode*/ espeakCHARS_AUTO,
|
|
||||||
/*phonememode = IPA*/ 0x02,
|
|
||||||
&terminator));
|
|
||||||
|
|
||||||
utf8::iterator phonemeIter(clausePhonemes.begin(), clausePhonemes.begin(),
|
|
||||||
clausePhonemes.end());
|
|
||||||
utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
|
|
||||||
clausePhonemes.end());
|
|
||||||
|
|
||||||
if (!sentencePhonemes) {
|
|
||||||
// Start new sentence
|
|
||||||
phonemes.emplace_back();
|
|
||||||
sentencePhonemes = &phonemes[phonemes.size() - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
|
|
||||||
|
|
||||||
// Add appropriate puntuation depending on terminator type
|
|
||||||
int intonation = terminator & 0x0000F000;
|
|
||||||
if (intonation == CLAUSE_INTONATION_FULL_STOP) {
|
|
||||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->fullStop);
|
|
||||||
} else if (intonation == CLAUSE_INTONATION_COMMA) {
|
|
||||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->comma);
|
|
||||||
} else if (intonation == CLAUSE_INTONATION_QUESTION) {
|
|
||||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->question);
|
|
||||||
} else if (intonation == CLAUSE_INTONATION_EXCLAMATION) {
|
|
||||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->exclamation);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) {
|
|
||||||
// End of sentence
|
|
||||||
sentencePhonemes = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // while inputTextPointer != NULL
|
|
||||||
|
|
||||||
} /* phonemize */
|
|
||||||
|
|
||||||
// Phonemes to ids using JSON map
|
|
||||||
void phonemes2ids(vector<Phoneme> &phonemes, PhonemizeConfig &phonemizeConfig,
|
|
||||||
vector<PhonemeId> &phonemeIds) {
|
|
||||||
if (phonemes.empty()) {
|
|
||||||
throw runtime_error("No phonemes");
|
|
||||||
}
|
|
||||||
|
|
||||||
phonemeIds.push_back(phonemizeConfig.idBos);
|
|
||||||
if (phonemizeConfig.interspersePad) {
|
|
||||||
phonemeIds.push_back(phonemizeConfig.idPad);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto phoneme = phonemes.begin(); phoneme != phonemes.end(); phoneme++) {
|
|
||||||
if (phonemizeConfig.phonemeIdMap.contains(*phoneme)) {
|
|
||||||
for (auto id : phonemizeConfig.phonemeIdMap[*phoneme]) {
|
|
||||||
phonemeIds.push_back(id);
|
|
||||||
|
|
||||||
if (phonemizeConfig.interspersePad) {
|
|
||||||
phonemeIds.push_back(phonemizeConfig.idPad);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
string phonemeStr;
|
|
||||||
utf8::append(*phoneme, phonemeStr);
|
|
||||||
cerr << "[WARN] No id for phoneme: " << phonemeStr << endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
phonemeIds.push_back(phonemizeConfig.idEos);
|
|
||||||
|
|
||||||
} /* phonemes2ids */
|
|
||||||
|
|
||||||
} // namespace piper
|
|
||||||
|
|
||||||
#endif // PHONEMIZE_H_
|
|
@ -0,0 +1,393 @@
|
|||||||
|
#include <array>
|
||||||
|
#include <chrono>
|
||||||
|
#include <fstream>
|
||||||
|
#include <limits>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include <espeak-ng/speak_lib.h>
|
||||||
|
#include <onnxruntime_cxx_api.h>
|
||||||
|
|
||||||
|
#include "piper.hpp"
|
||||||
|
#include "utf8.h"
|
||||||
|
#include "wavfile.hpp"
|
||||||
|
|
||||||
|
namespace piper {
|
||||||
|
|
||||||
|
// Maximum value for 16-bit signed WAV sample
|
||||||
|
const float MAX_WAV_VALUE = 32767.0f;
|
||||||
|
|
||||||
|
const std::string instanceName{"piper"};
|
||||||
|
|
||||||
|
bool isSingleCodepoint(std::string s) {
|
||||||
|
return utf8::distance(s.begin(), s.end()) == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
Phoneme getCodepoint(std::string s) {
|
||||||
|
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
|
||||||
|
return *character_iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||||
|
|
||||||
|
if (configRoot.contains("espeak")) {
|
||||||
|
if (!phonemizeConfig.eSpeak) {
|
||||||
|
phonemizeConfig.eSpeak.emplace();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto espeakValue = configRoot["espeak"];
|
||||||
|
if (espeakValue.contains("voice")) {
|
||||||
|
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (configRoot.contains("phoneme_type")) {
|
||||||
|
auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
|
||||||
|
if (phonemeTypeStr == "text") {
|
||||||
|
phonemizeConfig.phonemeType = TextPhonemes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// phoneme to [phoneme] map
|
||||||
|
if (configRoot.contains("phoneme_map")) {
|
||||||
|
if (!phonemizeConfig.phonemeMap) {
|
||||||
|
phonemizeConfig.phonemeMap.emplace();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto phonemeMapValue = configRoot["phoneme_map"];
|
||||||
|
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
|
||||||
|
std::string fromPhoneme = fromPhonemeItem.key();
|
||||||
|
if (!isSingleCodepoint(fromPhoneme)) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Phonemes must be one codepoint (phoneme map)");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||||
|
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
|
||||||
|
std::string toPhoneme = toPhonemeValue.get<std::string>();
|
||||||
|
if (!isSingleCodepoint(toPhoneme)) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Phonemes must be one codepoint (phoneme map)");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto toCodepoint = getCodepoint(toPhoneme);
|
||||||
|
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// phoneme to [id] map
|
||||||
|
if (configRoot.contains("phoneme_id_map")) {
|
||||||
|
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
|
||||||
|
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
|
||||||
|
std::string fromPhoneme = fromPhonemeItem.key();
|
||||||
|
if (!isSingleCodepoint(fromPhoneme)) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Phonemes must be one codepoint (phoneme id map)");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||||
|
for (auto &toIdValue : fromPhonemeItem.value()) {
|
||||||
|
PhonemeId toId = toIdValue.get<PhonemeId>();
|
||||||
|
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} /* parsePhonemizeConfig */
|
||||||
|
|
||||||
|
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||||
|
|
||||||
|
if (configRoot.contains("audio")) {
|
||||||
|
auto audioValue = configRoot["audio"];
|
||||||
|
if (audioValue.contains("sample_rate")) {
|
||||||
|
// Default sample rate is 22050 Hz
|
||||||
|
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} /* parseSynthesisConfig */
|
||||||
|
|
||||||
|
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
|
||||||
|
|
||||||
|
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
|
||||||
|
|
||||||
|
} /* parseModelConfig */
|
||||||
|
|
||||||
|
void initialize(PiperConfig &config) {
|
||||||
|
if (config.useESpeak) {
|
||||||
|
// Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
|
||||||
|
// See: https://github.com/rhasspy/espeak-ng
|
||||||
|
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
|
||||||
|
/*buflength*/ 0,
|
||||||
|
/*path*/ config.eSpeakDataPath.c_str(),
|
||||||
|
/*options*/ 0);
|
||||||
|
if (result < 0) {
|
||||||
|
throw std::runtime_error("Failed to initialize eSpeak-ng");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void terminate(PiperConfig &config) {
|
||||||
|
if (config.useESpeak) {
|
||||||
|
// Clean up espeak-ng
|
||||||
|
espeak_Terminate();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void loadModel(std::string modelPath, ModelSession &session) {
|
||||||
|
|
||||||
|
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
|
||||||
|
instanceName.c_str());
|
||||||
|
session.env.DisableTelemetryEvents();
|
||||||
|
|
||||||
|
// Slows down performance by ~2x
|
||||||
|
// session.options.SetIntraOpNumThreads(1);
|
||||||
|
|
||||||
|
// Roughly doubles load time for no visible inference benefit
|
||||||
|
// session.options.SetGraphOptimizationLevel(
|
||||||
|
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
|
||||||
|
|
||||||
|
session.options.SetGraphOptimizationLevel(
|
||||||
|
GraphOptimizationLevel::ORT_DISABLE_ALL);
|
||||||
|
|
||||||
|
// Slows down performance very slightly
|
||||||
|
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
|
||||||
|
|
||||||
|
session.options.DisableCpuMemArena();
|
||||||
|
session.options.DisableMemPattern();
|
||||||
|
session.options.DisableProfiling();
|
||||||
|
|
||||||
|
auto startTime = std::chrono::steady_clock::now();
|
||||||
|
session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
|
||||||
|
auto endTime = std::chrono::steady_clock::now();
|
||||||
|
auto loadDuration = std::chrono::duration<double>(endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load Onnx model and JSON config file
|
||||||
|
void loadVoice(PiperConfig &config, std::string modelPath,
|
||||||
|
std::string modelConfigPath, Voice &voice,
|
||||||
|
std::optional<SpeakerId> &speakerId) {
|
||||||
|
std::ifstream modelConfigFile(modelConfigPath);
|
||||||
|
voice.configRoot = json::parse(modelConfigFile);
|
||||||
|
|
||||||
|
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
|
||||||
|
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
|
||||||
|
parseModelConfig(voice.configRoot, voice.modelConfig);
|
||||||
|
|
||||||
|
if (voice.modelConfig.numSpeakers > 1) {
|
||||||
|
// Multi-speaker model
|
||||||
|
if (speakerId) {
|
||||||
|
voice.synthesisConfig.speakerId = speakerId;
|
||||||
|
} else {
|
||||||
|
// Default speaker
|
||||||
|
voice.synthesisConfig.speakerId = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
loadModel(modelPath, voice.session);
|
||||||
|
|
||||||
|
} /* loadVoice */
|
||||||
|
|
||||||
|
// Phoneme ids to WAV audio
|
||||||
|
void synthesize(std::vector<PhonemeId> &phonemeIds,
|
||||||
|
SynthesisConfig &synthesisConfig, ModelSession &session,
|
||||||
|
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
|
||||||
|
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
|
||||||
|
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
|
||||||
|
|
||||||
|
// Allocate
|
||||||
|
std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
|
||||||
|
std::vector<float> scales{synthesisConfig.noiseScale,
|
||||||
|
synthesisConfig.lengthScale,
|
||||||
|
synthesisConfig.noiseW};
|
||||||
|
|
||||||
|
std::vector<Ort::Value> inputTensors;
|
||||||
|
std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
|
||||||
|
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||||
|
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
|
||||||
|
phonemeIdsShape.size()));
|
||||||
|
|
||||||
|
std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
|
||||||
|
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||||
|
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
|
||||||
|
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
|
||||||
|
|
||||||
|
std::vector<int64_t> scalesShape{(int64_t)scales.size()};
|
||||||
|
inputTensors.push_back(
|
||||||
|
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
|
||||||
|
scalesShape.data(), scalesShape.size()));
|
||||||
|
|
||||||
|
// Add speaker id.
|
||||||
|
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
|
||||||
|
std::vector<int64_t> speakerId{
|
||||||
|
(int64_t)synthesisConfig.speakerId.value_or(0)};
|
||||||
|
std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
|
||||||
|
|
||||||
|
if (synthesisConfig.speakerId) {
|
||||||
|
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||||
|
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
|
||||||
|
speakerIdShape.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// From export_onnx.py
|
||||||
|
std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
|
||||||
|
"sid"};
|
||||||
|
std::array<const char *, 1> outputNames = {"output"};
|
||||||
|
|
||||||
|
// Infer
|
||||||
|
auto startTime = std::chrono::steady_clock::now();
|
||||||
|
auto outputTensors = session.onnx.Run(
|
||||||
|
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
|
||||||
|
inputTensors.size(), outputNames.data(), outputNames.size());
|
||||||
|
auto endTime = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
|
||||||
|
throw std::runtime_error("Invalid output tensors");
|
||||||
|
}
|
||||||
|
auto inferDuration = std::chrono::duration<double>(endTime - startTime);
|
||||||
|
result.inferSeconds = inferDuration.count();
|
||||||
|
|
||||||
|
const float *audio = outputTensors.front().GetTensorData<float>();
|
||||||
|
auto audioShape =
|
||||||
|
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
|
||||||
|
int64_t audioCount = audioShape[audioShape.size() - 1];
|
||||||
|
|
||||||
|
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
|
||||||
|
result.realTimeFactor = 0.0;
|
||||||
|
if (result.audioSeconds > 0) {
|
||||||
|
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get max audio value for scaling
|
||||||
|
float maxAudioValue = 0.01f;
|
||||||
|
for (int64_t i = 0; i < audioCount; i++) {
|
||||||
|
float audioValue = abs(audio[i]);
|
||||||
|
if (audioValue > maxAudioValue) {
|
||||||
|
maxAudioValue = audioValue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We know the size up front
|
||||||
|
audioBuffer.reserve(audioCount);
|
||||||
|
|
||||||
|
// Scale audio to fill range and convert to int16
|
||||||
|
float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
|
||||||
|
for (int64_t i = 0; i < audioCount; i++) {
|
||||||
|
int16_t intAudioValue = static_cast<int16_t>(
|
||||||
|
std::clamp(audio[i] * audioScale,
|
||||||
|
static_cast<float>(std::numeric_limits<int16_t>::min()),
|
||||||
|
static_cast<float>(std::numeric_limits<int16_t>::max())));
|
||||||
|
|
||||||
|
audioBuffer.push_back(intAudioValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
for (std::size_t i = 0; i < outputTensors.size(); i++) {
|
||||||
|
Ort::detail::OrtRelease(outputTensors[i].release());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (std::size_t i = 0; i < inputTensors.size(); i++) {
|
||||||
|
Ort::detail::OrtRelease(inputTensors[i].release());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Phonemize text and synthesize audio
|
||||||
|
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||||
|
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
|
||||||
|
const std::function<void()> &audioCallback) {
|
||||||
|
|
||||||
|
std::size_t sentenceSilenceSamples = 0;
|
||||||
|
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
|
||||||
|
sentenceSilenceSamples = (std::size_t)(
|
||||||
|
voice.synthesisConfig.sentenceSilenceSeconds *
|
||||||
|
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phonemes for each sentence
|
||||||
|
std::vector<std::vector<Phoneme>> phonemes;
|
||||||
|
|
||||||
|
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
|
||||||
|
// Use espeak-ng for phonemization
|
||||||
|
eSpeakPhonemeConfig eSpeakConfig;
|
||||||
|
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
|
||||||
|
phonemize_eSpeak(text, eSpeakConfig, phonemes);
|
||||||
|
} else {
|
||||||
|
// Use UTF-8 codepoints as "phonemes"
|
||||||
|
CodepointsPhonemeConfig codepointsConfig;
|
||||||
|
phonemize_codepoints(text, codepointsConfig, phonemes);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Synthesize each sentence independently.
|
||||||
|
std::vector<PhonemeId> phonemeIds;
|
||||||
|
std::map<Phoneme, std::size_t> missingPhonemes;
|
||||||
|
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
|
||||||
|
++phonemesIter) {
|
||||||
|
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
|
||||||
|
SynthesisResult sentenceResult;
|
||||||
|
|
||||||
|
PhonemeIdConfig idConfig;
|
||||||
|
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
|
||||||
|
auto &language = voice.phonemizeConfig.eSpeak->voice;
|
||||||
|
if (DEFAULT_ALPHABET.count(language) < 1) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Text phoneme language for voice is not supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use alphabet for language
|
||||||
|
idConfig.phonemeIdMap =
|
||||||
|
std::make_shared<PhonemeIdMap>(DEFAULT_ALPHABET[language]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// phonemes -> ids
|
||||||
|
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
|
||||||
|
|
||||||
|
// ids -> audio
|
||||||
|
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||||
|
sentenceResult);
|
||||||
|
|
||||||
|
// Add end of sentence silence
|
||||||
|
if (sentenceSilenceSamples > 0) {
|
||||||
|
for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
|
||||||
|
audioBuffer.push_back(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (audioCallback) {
|
||||||
|
// Call back must copy audio since it is cleared afterwards.
|
||||||
|
audioCallback();
|
||||||
|
audioBuffer.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
result.audioSeconds += sentenceResult.audioSeconds;
|
||||||
|
result.inferSeconds += sentenceResult.inferSeconds;
|
||||||
|
|
||||||
|
phonemeIds.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.audioSeconds > 0) {
|
||||||
|
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
} /* textToAudio */
|
||||||
|
|
||||||
|
// Phonemize text and synthesize audio to WAV file
|
||||||
|
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
|
||||||
|
std::ostream &audioFile, SynthesisResult &result) {
|
||||||
|
|
||||||
|
std::vector<int16_t> audioBuffer;
|
||||||
|
textToAudio(config, voice, text, audioBuffer, result, NULL);
|
||||||
|
|
||||||
|
// Write WAV
|
||||||
|
auto synthesisConfig = voice.synthesisConfig;
|
||||||
|
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
|
||||||
|
synthesisConfig.channels, (int32_t)audioBuffer.size(),
|
||||||
|
audioFile);
|
||||||
|
|
||||||
|
audioFile.write((const char *)audioBuffer.data(),
|
||||||
|
sizeof(int16_t) * audioBuffer.size());
|
||||||
|
|
||||||
|
} /* textToWavFile */
|
||||||
|
|
||||||
|
} // namespace piper
|
@ -1,130 +0,0 @@
|
|||||||
#ifndef SYNTHESIZE_H_
|
|
||||||
#define SYNTHESIZE_H_
|
|
||||||
|
|
||||||
#include <array>
|
|
||||||
#include <chrono>
|
|
||||||
#include <limits>
|
|
||||||
#include <memory>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include <onnxruntime_cxx_api.h>
|
|
||||||
|
|
||||||
#include "config.hpp"
|
|
||||||
#include "model.hpp"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace piper {
|
|
||||||
|
|
||||||
// Maximum value for 16-bit signed WAV sample
|
|
||||||
const float MAX_WAV_VALUE = 32767.0f;
|
|
||||||
|
|
||||||
struct SynthesisResult {
|
|
||||||
double inferSeconds;
|
|
||||||
double audioSeconds;
|
|
||||||
double realTimeFactor;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Phoneme ids to WAV audio
|
|
||||||
void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
|
|
||||||
ModelSession &session, vector<int16_t> &audioBuffer,
|
|
||||||
SynthesisResult &result) {
|
|
||||||
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
|
|
||||||
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
|
|
||||||
|
|
||||||
// Allocate
|
|
||||||
vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
|
|
||||||
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
|
|
||||||
synthesisConfig.noiseW};
|
|
||||||
|
|
||||||
vector<Ort::Value> inputTensors;
|
|
||||||
vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
|
|
||||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
|
||||||
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
|
|
||||||
phonemeIdsShape.size()));
|
|
||||||
|
|
||||||
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
|
|
||||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
|
||||||
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
|
|
||||||
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
|
|
||||||
|
|
||||||
vector<int64_t> scalesShape{(int64_t)scales.size()};
|
|
||||||
inputTensors.push_back(
|
|
||||||
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
|
|
||||||
scalesShape.data(), scalesShape.size()));
|
|
||||||
|
|
||||||
// Add speaker id.
|
|
||||||
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
|
|
||||||
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
|
|
||||||
vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
|
|
||||||
|
|
||||||
if (synthesisConfig.speakerId) {
|
|
||||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
|
||||||
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
|
|
||||||
speakerIdShape.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// From export_onnx.py
|
|
||||||
array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
|
|
||||||
"sid"};
|
|
||||||
array<const char *, 1> outputNames = {"output"};
|
|
||||||
|
|
||||||
// Infer
|
|
||||||
auto startTime = chrono::steady_clock::now();
|
|
||||||
auto outputTensors = session.onnx.Run(
|
|
||||||
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
|
|
||||||
inputTensors.size(), outputNames.data(), outputNames.size());
|
|
||||||
auto endTime = chrono::steady_clock::now();
|
|
||||||
|
|
||||||
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
|
|
||||||
throw runtime_error("Invalid output tensors");
|
|
||||||
}
|
|
||||||
auto inferDuration = chrono::duration<double>(endTime - startTime);
|
|
||||||
result.inferSeconds = inferDuration.count();
|
|
||||||
|
|
||||||
const float *audio = outputTensors.front().GetTensorData<float>();
|
|
||||||
auto audioShape =
|
|
||||||
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
|
|
||||||
int64_t audioCount = audioShape[audioShape.size() - 1];
|
|
||||||
|
|
||||||
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
|
|
||||||
result.realTimeFactor = 0.0;
|
|
||||||
if (result.audioSeconds > 0) {
|
|
||||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get max audio value for scaling
|
|
||||||
float maxAudioValue = 0.01f;
|
|
||||||
for (int64_t i = 0; i < audioCount; i++) {
|
|
||||||
float audioValue = abs(audio[i]);
|
|
||||||
if (audioValue > maxAudioValue) {
|
|
||||||
maxAudioValue = audioValue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We know the size up front
|
|
||||||
audioBuffer.reserve(audioCount);
|
|
||||||
|
|
||||||
// Scale audio to fill range and convert to int16
|
|
||||||
float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
|
|
||||||
for (int64_t i = 0; i < audioCount; i++) {
|
|
||||||
int16_t intAudioValue = static_cast<int16_t>(
|
|
||||||
clamp(audio[i] * audioScale,
|
|
||||||
static_cast<float>(numeric_limits<int16_t>::min()),
|
|
||||||
static_cast<float>(numeric_limits<int16_t>::max())));
|
|
||||||
|
|
||||||
audioBuffer.push_back(intAudioValue);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clean up
|
|
||||||
for (size_t i = 0; i < outputTensors.size(); i++) {
|
|
||||||
Ort::detail::OrtRelease(outputTensors[i].release());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < inputTensors.size(); i++) {
|
|
||||||
Ort::detail::OrtRelease(inputTensors[i].release());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace piper
|
|
||||||
|
|
||||||
#endif // SYNTHESIZE_H_
|
|
Loading…
Reference in New Issue