Use libtashkeel

pull/105/head
Michael Hansen 1 year ago
parent 555dd13679
commit 65bdade776

@@ -6,7 +6,6 @@ piper:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
cp -aR $(LIB_DIR)/piper_phonemize/lib/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/
cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/
clean:
rm -rf build/ dist/

@@ -14,7 +14,6 @@ string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime)
target_link_libraries(piper
piper_phonemize
@@ -28,12 +27,10 @@ if(NOT APPLE)
endif()
target_link_directories(piper PUBLIC
${PIPER_PHONEMIZE_ROOTDIR}/lib
${ONNXRUNTIME_ROOTDIR}/lib)
${PIPER_PHONEMIZE_ROOTDIR}/lib)
target_include_directories(piper PUBLIC
${PIPER_PHONEMIZE_ROOTDIR}/include
${ONNXRUNTIME_ROOTDIR}/include
${SPDLOG_INCLUDE_DIRS})
target_compile_options(piper PUBLIC

@@ -61,6 +61,10 @@ struct RunConfig {
// Path to espeak-ng data directory (default is next to piper executable)
optional<filesystem::path> eSpeakDataPath;
// Path to libtashkeel ort model
// https://github.com/mush42/libtashkeel/
optional<filesystem::path> tashkeelModelPath;
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
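
The new tashkeelModelPath field stays empty unless the user passes the --tashkeel_model option parsed further down; when it is empty, main() falls back to a libtashkeel_model.ort file next to the piper executable. A minimal sketch of filling the field programmatically instead of via the CLI, assuming only the struct shown in this hunk (the path is a placeholder, not part of the patch):

    #include <filesystem>
    #include <optional>

    // Sketch only: pre-fill the new field instead of using --tashkeel_model.
    void setTashkeelModel(RunConfig &runConfig) {
      // Placeholder path for illustration.
      runConfig.tashkeelModelPath =
          std::filesystem::path("/opt/piper/libtashkeel_model.ort");
    }
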
@@ -90,33 +94,34 @@ int main(int argc, char *argv[]) {
spdlog::info("Loaded voice in {} second(s)",
chrono::duration<double>(endTime - startTime).count());
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
spdlog::debug("Voice uses eSpeak phonemes ({})",
voice.phonemizeConfig.eSpeak->voice);
if (runConfig.eSpeakDataPath) {
// User provided path
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
} else {
// Get the path to the piper executable so we can locate espeak-ng-data
// next to it.
// Get the path to the piper executable so we can locate espeak-ng-data, etc.
// next to it.
#ifdef _MSC_VER
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
#elifdef __APPLE__
auto exePath = []() {
char moduleFileName[PATH_MAX] = { 0 };
char moduleFileName[PATH_MAX] = {0};
uint32_t moduleFileNameSize = std::size(moduleFileName);
_NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
return filesystem::path(moduleFileName);
}();
#else
auto exePath = filesystem::canonical("/proc/self/exe");
auto exePath = filesystem::canonical("/proc/self/exe");
#endif
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
spdlog::debug("Voice uses eSpeak phonemes ({})",
voice.phonemizeConfig.eSpeak.voice);
if (runConfig.eSpeakDataPath) {
// User provided path
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
} else {
// Assume next to piper executable
piperConfig.eSpeakDataPath =
std::filesystem::absolute(
exePath.parent_path().append("espeak-ng-data"))
@@ -130,6 +135,25 @@ int main(int argc, char *argv[]) {
piperConfig.useESpeak = false;
}
// Enable libtashkeel for Arabic
if (voice.phonemizeConfig.eSpeak.voice == "ar") {
piperConfig.useTashkeel = true;
if (runConfig.tashkeelModelPath) {
// User provided path
piperConfig.tashkeelModelPath =
runConfig.tashkeelModelPath.value().string();
} else {
// Assume next to piper executable
piperConfig.tashkeelModelPath =
std::filesystem::absolute(
exePath.parent_path().append("libtashkeel_model.ort"))
.string();
spdlog::debug("libtashkeel model is expected at {}",
piperConfig.tashkeelModelPath.value());
}
}
piper::initialize(piperConfig);
// Scales
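
Both the espeak-ng data and the libtashkeel model are resolved the same way when no explicit path is given: take the executable's directory and append a well-known file name. A condensed sketch of that pattern, using a hypothetical helper name that does not appear in the patch (exePath is obtained per-platform as in the hunk above):

    #include <filesystem>
    #include <string>

    // Hypothetical helper: resolve a resource expected to sit next to the
    // piper executable.
    static std::string resolveNextToExe(const std::filesystem::path &exePath,
                                        const std::string &fileName) {
      return std::filesystem::absolute(exePath.parent_path() / fileName).string();
    }

    // e.g. resolveNextToExe(exePath, "espeak-ng-data")
    //      resolveNextToExe(exePath, "libtashkeel_model.ort")
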
@@ -365,6 +389,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
ensureArg(argc, argv, i);
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
} else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
ensureArg(argc, argv, i);
runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
} else if (arg == "--debug") {
// Set DEBUG logging
spdlog::set_level(spdlog::level::debug);
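
Both spellings of the new flag are accepted, mirroring the existing --espeak_data/--espeak-data pair. A hypothetical invocation (the voice model name and paths are placeholders; piper reads the input text from stdin):

    echo 'مرحبا' | piper --model ar-voice.onnx \
        --tashkeel_model /opt/piper/libtashkeel_model.ort \
        --output_file welcome.wav
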

@@ -47,13 +47,9 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
// }
if (configRoot.contains("espeak")) {
if (!phonemizeConfig.eSpeak) {
phonemizeConfig.eSpeak.emplace();
}
auto espeakValue = configRoot["espeak"];
if (espeakValue.contains("voice")) {
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
phonemizeConfig.eSpeak.voice = espeakValue["voice"].get<std::string>();
}
}
@@ -175,6 +171,22 @@ void initialize(PiperConfig &config) {
spdlog::debug("Initialized eSpeak");
}
// Load onnx model for libtashkeel
// https://github.com/mush42/libtashkeel/
if (config.useTashkeel) {
spdlog::debug("Using libtashkeel for diacritization");
if (!config.tashkeelModelPath) {
throw std::runtime_error("No path to libtashkeel model");
}
spdlog::debug("Loading libtashkeel model from {}",
config.tashkeelModelPath.value());
config.tashkeelState = std::make_unique<tashkeel::State>();
tashkeel::tashkeel_load(config.tashkeelModelPath.value(),
*config.tashkeelState);
spdlog::debug("Initialized libtashkeel");
}
spdlog::info("Initialized piper");
}
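
For context, the libtashkeel calls used here (tashkeel::State, tashkeel_load, tashkeel_run from <tashkeel.hpp>) can be exercised on their own. A minimal standalone sketch, assuming the same API as in this patch; the model path is a placeholder (the CLI defaults to libtashkeel_model.ort next to the piper executable):

    #include <string>
    #include <tashkeel.hpp>

    // Load the libtashkeel ONNX model once, then diacritize Arabic text,
    // mirroring initialize() and textToAudio() above.
    std::string diacritize(const std::string &text) {
      tashkeel::State state;
      tashkeel::tashkeel_load("/opt/piper/libtashkeel_model.ort", state);
      return tashkeel::tashkeel_run(text, state);
    }
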
@@ -368,6 +380,15 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
if (config.useTashkeel) {
if (!config.tashkeelState) {
throw std::runtime_error("Tashkeel model is not loaded");
}
spdlog::debug("Diacritizing text with libtashkeel: {}", text);
text = tashkeel::tashkeel_run(text, *config.tashkeelState);
}
// Phonemes for each sentence
spdlog::debug("Phonemizing text: {}", text);
std::vector<std::vector<Phoneme>> phonemes;
@@ -375,7 +396,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
// Use espeak-ng for phonemization
eSpeakPhonemeConfig eSpeakConfig;
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
phonemize_eSpeak(text, eSpeakConfig, phonemes);
} else {
// Use UTF-8 codepoints as "phonemes"
@@ -405,7 +426,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
PhonemeIdConfig idConfig;
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
auto &language = voice.phonemizeConfig.eSpeak->voice;
auto &language = voice.phonemizeConfig.eSpeak.voice;
spdlog::debug("Text phoneme language: {}", language);
if (DEFAULT_ALPHABET.count(language) < 1) {
throw std::runtime_error(

@@ -1,8 +1,8 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <functional>
#include <fstream>
#include <functional>
#include <optional>
#include <string>
#include <vector>
@@ -10,6 +10,7 @@
#include <onnxruntime_cxx_api.h>
#include <phoneme_ids.hpp>
#include <phonemize.hpp>
#include <tashkeel.hpp>
#include "json.hpp"
@@ -26,6 +27,10 @@ struct eSpeakConfig {
struct PiperConfig {
std::string eSpeakDataPath;
bool useESpeak = true;
bool useTashkeel = false;
std::optional<std::string> tashkeelModelPath;
std::unique_ptr<tashkeel::State> tashkeelState;
};
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
@@ -40,7 +45,7 @@ struct PhonemizeConfig {
PhonemeId idEos = 2; // end of sentence
bool interspersePad = true;
std::optional<eSpeakConfig> eSpeak;
eSpeakConfig eSpeak;
};
struct SynthesisConfig {
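
Putting the new PiperConfig fields together, an embedder linking against piper as a library could opt into diacritization roughly like this. This is a sketch based only on the fields and the piper::initialize call shown in this commit, assuming the header is piper.hpp and the types live in namespace piper as used from main.cpp above; the paths are placeholders:

    #include "piper.hpp"

    // Enable libtashkeel alongside eSpeak for an Arabic voice.
    void configureArabic(piper::PiperConfig &config) {
      config.eSpeakDataPath = "/opt/piper/espeak-ng-data";          // placeholder
      config.useTashkeel = true;
      config.tashkeelModelPath = "/opt/piper/libtashkeel_model.ort"; // placeholder
      piper::initialize(config); // loads espeak-ng data and the tashkeel model
    }
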
