// Source: mirror of https://github.com/rhasspy/piper (C++ header, 156 lines, 4.1 KiB).
#ifndef CONFIG_H_
|
|
#define CONFIG_H_
|
|
|
|
#include <cstdint>
#include <filesystem>
#include <map>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

#include "json.hpp"
#include "utf8.h"
|
|
|
|
using namespace std;
|
|
using json = nlohmann::json;
|
|
|
|
namespace piper {
|
|
|
|
/// One phoneme, stored as a single 32-bit codepoint.
using Phoneme = char32_t;

/// Integer id a phoneme is mapped to (see PhonemizeConfig::phonemeIdMap).
using PhonemeId = std::int64_t;

/// Integer id of a speaker (see SynthesisConfig::speakerId).
using SpeakerId = std::int64_t;

/// Initial value of eSpeakConfig::voice.
///
/// Declared `inline` so that every translation unit including this header
/// refers to the same object instead of owning a private copy.
inline const std::string DefaultVoice = "en-us";

/// Mode selector stored in eSpeakConfig::mode.
///
/// Deliberately left as an unscoped enum so existing code can keep using the
/// unqualified enumerators (Text = 0, TextWithPhonemes = 1, SSML = 2).
enum eSpeakMode { Text, TextWithPhonemes, SSML };
|
|
|
|
struct eSpeakConfig {
|
|
string voice = DefaultVoice;
|
|
eSpeakMode mode = Text;
|
|
|
|
// Characters that eSpeak uses to break apart paragraphs/sentences
|
|
set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
|
|
|
|
Phoneme fullStop = U'.';
|
|
Phoneme comma = U',';
|
|
Phoneme question = U'?';
|
|
Phoneme exclamation = U'!';
|
|
};
|
|
|
|
struct PhonemizeConfig {
|
|
optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
|
|
map<Phoneme, vector<PhonemeId>> phonemeIdMap;
|
|
|
|
PhonemeId idPad = 0; // padding (optionally interspersed)
|
|
PhonemeId idBos = 1; // beginning of sentence
|
|
PhonemeId idEos = 2; // end of sentence
|
|
bool interspersePad = true;
|
|
|
|
optional<eSpeakConfig> eSpeak;
|
|
};
|
|
|
|
struct SynthesisConfig {
|
|
float noiseScale = 0.667f;
|
|
float lengthScale = 1.0f;
|
|
float noiseW = 0.8f;
|
|
int sampleRate = 22050;
|
|
int sampleWidth = 2; // 16-bit
|
|
int channels = 1; // mono
|
|
optional<SpeakerId> speakerId;
|
|
float sentenceSilenceSeconds = 0.2f;
|
|
};
|
|
|
|
/// Model-level values read from the voice configuration.
struct ModelConfig {
  /// Number of speakers, filled in from the "num_speakers" config key by
  /// parseModelConfig. Initialised to zero so that a default-constructed
  /// ModelConfig never exposes an indeterminate value.
  int numSpeakers = 0;
};
|
|
|
|
bool isSingleCodepoint(string s) {
|
|
return utf8::distance(s.begin(), s.end()) == 1;
|
|
}
|
|
|
|
/// Decodes and returns the first codepoint of the UTF-8 string `s`.
///
/// Callers in this file check isSingleCodepoint() first, so `s` is expected
/// to be non-empty. NOTE(review): dereferencing the iterator on an empty or
/// malformed string is expected to throw from utfcpp -- confirm against the
/// bundled utf8.h.
///
/// Marked `inline` because it is defined in a header (avoids duplicate
/// definitions across translation units); takes `s` by const reference to
/// avoid a copy.
inline Phoneme getCodepoint(const std::string &s) {
  // Iterator positioned at the start of the range [begin, end).
  utf8::iterator character_iter(s.begin(), s.begin(), s.end());
  return *character_iter;
}
|
|
|
|
/// Populates `phonemizeConfig` from the JSON configuration `configRoot`.
///
/// Recognised top-level keys (all optional):
///  - "espeak":         ensures phonemizeConfig.eSpeak is engaged; a nested
///                      "voice" string replaces eSpeak->voice.
///  - "phoneme_map":    object of phoneme -> [phoneme]; ensures
///                      phonemizeConfig.phonemeMap is engaged and appends to
///                      it.
///  - "phoneme_id_map": object of phoneme -> [id]; appends to
///                      phonemizeConfig.phonemeIdMap.
///
/// Entries are appended to whatever the maps already contain; nothing is
/// cleared first.
///
/// Marked `inline` because it is defined in a header (avoids duplicate
/// definitions when several translation units include it).
///
/// @throws std::runtime_error if a phoneme key or mapped phoneme is not
///         exactly one codepoint. Type-conversion errors raised by the JSON
///         library propagate unchanged.
inline void parsePhonemizeConfig(json &configRoot,
                                 PhonemizeConfig &phonemizeConfig) {
  // Validates that `text` is exactly one codepoint and decodes it; throws
  // std::runtime_error carrying `errorMessage` otherwise.
  const auto toCodepoint = [](const std::string &text,
                              const char *errorMessage) -> Phoneme {
    if (!isSingleCodepoint(text)) {
      throw std::runtime_error(errorMessage);
    }
    return getCodepoint(text);
  };

  if (configRoot.contains("espeak")) {
    if (!phonemizeConfig.eSpeak) {
      phonemizeConfig.eSpeak.emplace();
    }

    // Bind by reference: `auto x = configRoot[...]` would deep-copy the
    // whole JSON subtree.
    const json &espeakValue = configRoot["espeak"];
    if (espeakValue.contains("voice")) {
      phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
    }
  }

  // phoneme to [phoneme] map
  if (configRoot.contains("phoneme_map")) {
    if (!phonemizeConfig.phonemeMap) {
      phonemizeConfig.phonemeMap.emplace();
    }

    const json &phonemeMapValue = configRoot["phoneme_map"];
    for (const auto &fromPhonemeItem : phonemeMapValue.items()) {
      const Phoneme fromCodepoint =
          toCodepoint(fromPhonemeItem.key(),
                      "Phonemes must be one codepoint (phoneme map)");

      for (const auto &toPhonemeValue : fromPhonemeItem.value()) {
        const Phoneme toCodepointValue =
            toCodepoint(toPhonemeValue.get<std::string>(),
                        "Phonemes must be one codepoint (phoneme map)");

        // Looked up inside the loop on purpose: a key with an empty list
        // must not create an empty entry.
        (*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(
            toCodepointValue);
      }
    }
  }

  // phoneme to [id] map
  if (configRoot.contains("phoneme_id_map")) {
    const json &phonemeIdMapValue = configRoot["phoneme_id_map"];
    for (const auto &fromPhonemeItem : phonemeIdMapValue.items()) {
      const Phoneme fromCodepoint =
          toCodepoint(fromPhonemeItem.key(),
                      "Phonemes must be one codepoint (phoneme id map)");

      for (const auto &toIdValue : fromPhonemeItem.value()) {
        const PhonemeId toId = toIdValue.get<PhonemeId>();
        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
      }
    }
  }

} /* parsePhonemizeConfig */
|
|
|
|
/// Updates `synthesisConfig` from the JSON configuration `configRoot`.
///
/// Only "audio" -> "sample_rate" is read; when either key is absent,
/// `synthesisConfig` is left untouched.
///
/// Marked `inline` because it is defined in a header (avoids duplicate
/// definitions when several translation units include it).
inline void parseSynthesisConfig(json &configRoot,
                                 SynthesisConfig &synthesisConfig) {
  if (!configRoot.contains("audio")) {
    return;
  }

  // Bind by reference instead of deep-copying the "audio" subtree.
  const json &audioValue = configRoot["audio"];
  if (audioValue.contains("sample_rate")) {
    // Default sample rate is 22050 Hz (the fallback argument is not reached
    // here because the key's presence was just checked).
    synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
  }

} /* parseSynthesisConfig */
|
|
|
|
/// Reads the "num_speakers" key of `configRoot` into
/// `modelConfig.numSpeakers`.
///
/// The key is required. NOTE(review): with nlohmann::json, a missing key is
/// expected to be inserted as null by the non-const operator[] and then to
/// make get<>() throw a JSON type error -- confirm against the bundled
/// json.hpp.
///
/// Marked `inline` because it is defined in a header (avoids duplicate
/// definitions when several translation units include it).
inline void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
  // The value is read as SpeakerId (64-bit) and stored in an int; the cast
  // makes that narrowing explicit.
  modelConfig.numSpeakers =
      static_cast<int>(configRoot["num_speakers"].get<SpeakerId>());

} /* parseModelConfig */
|
|
|
|
} // namespace piper
|
|
|
|
#endif // CONFIG_H_
|