Use libtashkeel

pull/105/head
Michael Hansen 1 year ago
parent 555dd13679
commit 65bdade776

@@ -6,7 +6,6 @@ piper:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
cp -aR $(LIB_DIR)/piper_phonemize/lib/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/
cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/
clean:
rm -rf build/ dist/

@@ -14,7 +14,6 @@ string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime)
target_link_libraries(piper
piper_phonemize
@@ -28,12 +27,10 @@ if(NOT APPLE)
endif()
target_link_directories(piper PUBLIC
${PIPER_PHONEMIZE_ROOTDIR}/lib
${ONNXRUNTIME_ROOTDIR}/lib)
${PIPER_PHONEMIZE_ROOTDIR}/lib)
target_include_directories(piper PUBLIC
${PIPER_PHONEMIZE_ROOTDIR}/include
${ONNXRUNTIME_ROOTDIR}/include
${SPDLOG_INCLUDE_DIRS})
target_compile_options(piper PUBLIC

@@ -61,6 +61,10 @@ struct RunConfig {
// Path to espeak-ng data directory (default is next to piper executable)
optional<filesystem::path> eSpeakDataPath;
// Path to libtashkeel ort model
// https://github.com/mush42/libtashkeel/
optional<filesystem::path> tashkeelModelPath;
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
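
The new tashkeelModelPath field stays empty unless the user passes the --tashkeel_model option parsed further down; when it is empty, main() falls back to a libtashkeel_model.ort file next to the piper executable. A minimal sketch of filling the field programmatically instead of via the CLI, assuming only the struct shown in this hunk (the path is a placeholder, not part of the patch):

    #include <filesystem>
    #include <optional>

    // Sketch only: pre-fill the new field instead of using --tashkeel_model.
    void setTashkeelModel(RunConfig &runConfig) {
      // Placeholder path for illustration.
      runConfig.tashkeelModelPath =
          std::filesystem::path("/opt/piper/libtashkeel_model.ort");
    }
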
@@ -90,33 +94,34 @@ int main(int argc, char *argv[]) {
spdlog::info("Loaded voice in {} second(s)",
chrono::duration<double>(endTime - startTime).count());
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
spdlog::debug("Voice uses eSpeak phonemes ({})",
voice.phonemizeConfig.eSpeak->voice);
if (runConfig.eSpeakDataPath) {
// User provided path
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
} else {
// Get the path to the piper executable so we can locate espeak-ng-data
// next to it.
// Get the path to the piper executable so we can locate espeak-ng-data, etc.
// next to it.
#ifdef _MSC_VER
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
#elifdef __APPLE__
auto exePath = []() {
char moduleFileName[PATH_MAX] = { 0 };
char moduleFileName[PATH_MAX] = {0};
uint32_t moduleFileNameSize = std::size(moduleFileName);
_NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
return filesystem::path(moduleFileName);
}();
#else
auto exePath = filesystem::canonical("/proc/self/exe");
auto exePath = filesystem::canonical("/proc/self/exe");
#endif
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
spdlog::debug("Voice uses eSpeak phonemes ({})",
voice.phonemizeConfig.eSpeak.voice);
if (runConfig.eSpeakDataPath) {
// User provided path
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
} else {
// Assume next to piper executable
piperConfig.eSpeakDataPath =
std::filesystem::absolute(
exePath.parent_path().append("espeak-ng-data"))
@@ -130,6 +135,25 @@ int main(int argc, char *argv[]) {
piperConfig.useESpeak = false;
}
// Enable libtashkeel for Arabic
if (voice.phonemizeConfig.eSpeak.voice == "ar") {
piperConfig.useTashkeel = true;
if (runConfig.tashkeelModelPath) {
// User provided path
piperConfig.tashkeelModelPath =
runConfig.tashkeelModelPath.value().string();
} else {
// Assume next to piper executable
piperConfig.tashkeelModelPath =
std::filesystem::absolute(
exePath.parent_path().append("libtashkeel_model.ort"))
.string();
spdlog::debug("libtashkeel model is expected at {}",
piperConfig.tashkeelModelPath.value());
}
}
piper::initialize(piperConfig);
// Scales
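
Both the espeak-ng data and the libtashkeel model are resolved the same way when no explicit path is given: take the executable's directory and append a well-known file name. A condensed sketch of that pattern, using a hypothetical helper name that does not appear in the patch (exePath is obtained per-platform as in the hunk above):

    #include <filesystem>
    #include <string>

    // Hypothetical helper: resolve a resource expected to sit next to the
    // piper executable.
    static std::string resolveNextToExe(const std::filesystem::path &exePath,
                                        const std::string &fileName) {
      return std::filesystem::absolute(exePath.parent_path() / fileName).string();
    }

    // e.g. resolveNextToExe(exePath, "espeak-ng-data")
    //      resolveNextToExe(exePath, "libtashkeel_model.ort")
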
@@ -365,6 +389,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
ensureArg(argc, argv, i);
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
} else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
ensureArg(argc, argv, i);
runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
} else if (arg == "--debug") {
// Set DEBUG logging
spdlog::set_level(spdlog::level::debug);
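
Both spellings of the new flag are accepted, mirroring the existing --espeak_data/--espeak-data pair. A hypothetical invocation (the voice model name and paths are placeholders; piper reads the input text from stdin):

    echo 'مرحبا' | piper --model ar-voice.onnx \
        --tashkeel_model /opt/piper/libtashkeel_model.ort \
        --output_file welcome.wav
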

@@ -47,13 +47,9 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
// }
if (configRoot.contains("espeak")) {
if (!phonemizeConfig.eSpeak) {
phonemizeConfig.eSpeak.emplace();
}
auto espeakValue = configRoot["espeak"];
if (espeakValue.contains("voice")) {
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
phonemizeConfig.eSpeak.voice = espeakValue["voice"].get<std::string>();
}
}
@@ -175,6 +171,22 @@ void initialize(PiperConfig &config) {
spdlog::debug("Initialized eSpeak");
}
// Load onnx model for libtashkeel
// https://github.com/mush42/libtashkeel/
if (config.useTashkeel) {
spdlog::debug("Using libtashkeel for diacritization");
if (!config.tashkeelModelPath) {
throw std::runtime_error("No path to libtashkeel model");
}
spdlog::debug("Loading libtashkeel model from {}",
config.tashkeelModelPath.value());
config.tashkeelState = std::make_unique<tashkeel::State>();
tashkeel::tashkeel_load(config.tashkeelModelPath.value(),
*config.tashkeelState);
spdlog::debug("Initialized libtashkeel");
}
spdlog::info("Initialized piper");
}
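
For context, the libtashkeel calls used here (tashkeel::State, tashkeel_load, tashkeel_run from <tashkeel.hpp>) can be exercised on their own. A minimal standalone sketch, assuming the same API as in this patch; the model path is a placeholder (the CLI defaults to libtashkeel_model.ort next to the piper executable):

    #include <string>
    #include <tashkeel.hpp>

    // Load the libtashkeel ONNX model once, then diacritize Arabic text,
    // mirroring initialize() and textToAudio() above.
    std::string diacritize(const std::string &text) {
      tashkeel::State state;
      tashkeel::tashkeel_load("/opt/piper/libtashkeel_model.ort", state);
      return tashkeel::tashkeel_run(text, state);
    }
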
@@ -368,6 +380,15 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
if (config.useTashkeel) {
if (!config.tashkeelState) {
throw std::runtime_error("Tashkeel model is not loaded");
}
spdlog::debug("Diacritizing text with libtashkeel: {}", text);
text = tashkeel::tashkeel_run(text, *config.tashkeelState);
}
// Phonemes for each sentence
spdlog::debug("Phonemizing text: {}", text);
std::vector<std::vector<Phoneme>> phonemes;
@@ -375,7 +396,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
// Use espeak-ng for phonemization
eSpeakPhonemeConfig eSpeakConfig;
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
phonemize_eSpeak(text, eSpeakConfig, phonemes);
} else {
// Use UTF-8 codepoints as "phonemes"
@@ -405,7 +426,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
PhonemeIdConfig idConfig;
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
auto &language = voice.phonemizeConfig.eSpeak->voice;
auto &language = voice.phonemizeConfig.eSpeak.voice;
spdlog::debug("Text phoneme language: {}", language);
if (DEFAULT_ALPHABET.count(language) < 1) {
throw std::runtime_error(

@@ -1,8 +1,8 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <functional>
#include <fstream>
#include <functional>
#include <optional>
#include <string>
#include <vector>
@@ -10,6 +10,7 @@
#include <onnxruntime_cxx_api.h>
#include <phoneme_ids.hpp>
#include <phonemize.hpp>
#include <tashkeel.hpp>
#include "json.hpp"
@@ -26,6 +27,10 @@ struct eSpeakConfig {
struct PiperConfig {
std::string eSpeakDataPath;
bool useESpeak = true;
bool useTashkeel = false;
std::optional<std::string> tashkeelModelPath;
std::unique_ptr<tashkeel::State> tashkeelState;
};
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
@@ -40,7 +45,7 @@ struct PhonemizeConfig {
PhonemeId idEos = 2; // end of sentence
bool interspersePad = true;
std::optional<eSpeakConfig> eSpeak;
eSpeakConfig eSpeak;
};
struct SynthesisConfig {
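
Putting the new PiperConfig fields together, an embedder linking against piper as a library could opt into diacritization roughly like this. This is a sketch based only on the fields and the piper::initialize call shown in this commit, assuming the header is piper.hpp and the types live in namespace piper as used from main.cpp above; the paths are placeholders:

    #include "piper.hpp"

    // Enable libtashkeel alongside eSpeak for an Arabic voice.
    void configureArabic(piper::PiperConfig &config) {
      config.eSpeakDataPath = "/opt/piper/espeak-ng-data";          // placeholder
      config.useTashkeel = true;
      config.tashkeelModelPath = "/opt/piper/libtashkeel_model.ort"; // placeholder
      piper::initialize(config); // loads espeak-ng data and the tashkeel model
    }
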
