First working version with libpiper_phonemize

pull/100/head
Michael Hansen 12 months ago
parent 7d27863b48
commit 810fad44cf

@@ -1,16 +1,12 @@
.PHONY: release debug clean test
.PHONY: piper clean test
release:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
LIB_DIR := lib/Linux-$(shell uname -m)
no-pcaudio:
piper:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release -DUSE_PCAUDIO=OFF && make
debug:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Debug && make
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
cp -aR $(LIB_DIR)/piper_phonemize/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/
cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/
clean:
rm -rf build/ dist/

Binary file not shown.

@@ -4,47 +4,31 @@ include(CheckIncludeFileCXX)
project(piper C CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
ADD_EXECUTABLE(piper main.cpp)
ADD_EXECUTABLE(piper main.cpp piper.cpp)
string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
find_package(PkgConfig)
pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
# https://github.com/espeak-ng/pcaudiolib
check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
if(PCAUDIO_INCLUDE_FOUND)
option(USE_PCAUDIO "Build with pcaudiolib" ON)
if(USE_PCAUDIO)
target_compile_definitions(piper PUBLIC HAVE_PCAUDIO)
set(PCAUDIO_LIBRARIES "pcaudio")
endif()
endif()
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR})
set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime)
target_link_libraries(piper
piper_phonemize
espeak-ng
onnxruntime
pthread
${ESPEAK_NG_LIBRARIES}
${PCAUDIO_LIBRARIES})
pthread)
if(NOT APPLE)
target_link_libraries(piper -static-libgcc -static-libstdc++)
endif()
target_link_directories(piper PUBLIC
${ESPEAK_NG_LIBRARY_DIRS}
${PIPER_PHONEMIZE_ROOTDIR}/lib
${ONNXRUNTIME_ROOTDIR}/lib)
target_include_directories(piper PUBLIC
${ONNXRUNTIME_ROOTDIR}/include
${ESPEAK_NG_INCLUDE_DIRS})
target_compile_options(piper PUBLIC
${ESPEAK_NG_CFLAGS_OTHER})
${PIPER_PHONEMIZE_ROOTDIR}/include
${ONNXRUNTIME_ROOTDIR}/include)

@@ -1,155 +0,0 @@
#ifndef CONFIG_H_
#define CONFIG_H_
#include <filesystem>
#include <map>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
#include "json.hpp"
#include "utf8.h"
using namespace std;
using json = nlohmann::json;
namespace piper {
typedef char32_t Phoneme;
typedef int64_t PhonemeId;
typedef int64_t SpeakerId;
const string DefaultVoice = "en-us";
enum eSpeakMode { Text, TextWithPhonemes, SSML };
struct eSpeakConfig {
string voice = DefaultVoice;
eSpeakMode mode = Text;
// Characters that eSpeak uses to break apart paragraphs/sentences
set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
Phoneme fullStop = U'.';
Phoneme comma = U',';
Phoneme question = U'?';
Phoneme exclamation = U'!';
};
struct PhonemizeConfig {
optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
map<Phoneme, vector<PhonemeId>> phonemeIdMap;
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
bool interspersePad = true;
optional<eSpeakConfig> eSpeak;
};
struct SynthesisConfig {
float noiseScale = 0.667f;
float lengthScale = 1.0f;
float noiseW = 0.8f;
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
optional<SpeakerId> speakerId;
float sentenceSilenceSeconds = 0.2f;
};
struct ModelConfig {
int numSpeakers;
};
bool isSingleCodepoint(string s) {
return utf8::distance(s.begin(), s.end()) == 1;
}
Phoneme getCodepoint(string s) {
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
return *character_iter;
}
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
if (configRoot.contains("espeak")) {
if (!phonemizeConfig.eSpeak) {
phonemizeConfig.eSpeak.emplace();
}
auto espeakValue = configRoot["espeak"];
if (espeakValue.contains("voice")) {
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<string>();
}
}
// phoneme to [phoneme] map
if (configRoot.contains("phoneme_map")) {
if (!phonemizeConfig.phonemeMap) {
phonemizeConfig.phonemeMap.emplace();
}
auto phonemeMapValue = configRoot["phoneme_map"];
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
string toPhoneme = toPhonemeValue.get<string>();
if (!isSingleCodepoint(toPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
}
auto toCodepoint = getCodepoint(toPhoneme);
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
}
}
}
// phoneme to [id] map
if (configRoot.contains("phoneme_id_map")) {
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toIdValue : fromPhonemeItem.value()) {
PhonemeId toId = toIdValue.get<PhonemeId>();
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
}
}
}
} /* parsePhonemizeConfig */
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
if (configRoot.contains("audio")) {
auto audioValue = configRoot["audio"];
if (audioValue.contains("sample_rate")) {
// Default sample rate is 22050 Hz
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
}
}
} /* parseSynthesisConfig */
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
} /* parseModelConfig */
} // namespace piper
#endif // CONFIG_H_

@@ -2,6 +2,7 @@
#include <condition_variable>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <mutex>
#include <sstream>
@@ -10,11 +11,6 @@
#include <thread>
#include <vector>
#ifdef HAVE_PCAUDIO
// https://github.com/espeak-ng/pcaudiolib
#include <pcaudiolib/audio.h>
#endif
#ifdef _MSC_VER
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
@@ -29,19 +25,13 @@
using namespace std;
enum OutputType {
OUTPUT_FILE,
OUTPUT_DIRECTORY,
OUTPUT_STDOUT,
OUTPUT_PLAY,
OUTPUT_RAW
};
enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
struct RunConfig {
filesystem::path modelPath;
filesystem::path modelConfigPath;
OutputType outputType = OUTPUT_PLAY;
optional<filesystem::path> outputPath;
OutputType outputType = OUTPUT_DIRECTORY;
optional<filesystem::path> outputPath = filesystem::path(".");
optional<piper::SpeakerId> speakerId;
optional<float> noiseScale;
optional<float> lengthScale;
@@ -53,12 +43,6 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
#ifdef HAVE_PCAUDIO
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
#endif
int main(int argc, char *argv[]) {
RunConfig runConfig;
parseArgs(argc, argv, runConfig);
@@ -66,7 +50,7 @@ int main(int argc, char *argv[]) {
// NOTE: This won't work for Windows (need GetModuleFileName)
#ifdef _MSC_VER
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = { 0 };
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
@@ -81,17 +65,22 @@ int main(int argc, char *argv[]) {
#else
auto exePath = filesystem::canonical("/proc/self/exe");
#endif
#endif
piper::initialize(exePath.parent_path());
piper::PiperConfig piperConfig;
piperConfig.eSpeakDataPath =
std::filesystem::absolute(exePath.parent_path().append("espeak-ng-data"))
.string();
piper::Voice voice;
auto startTime = chrono::steady_clock::now();
loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
voice, runConfig.speakerId);
loadVoice(piperConfig, runConfig.modelPath.string(),
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
auto endTime = chrono::steady_clock::now();
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
cerr << "Load time: " << loadSeconds << " sec" << endl;
piper::initialize(piperConfig);
// Scales
if (runConfig.noiseScale) {
voice.synthesisConfig.noiseScale = runConfig.noiseScale.value();
@@ -105,33 +94,6 @@ int main(int argc, char *argv[]) {
voice.synthesisConfig.noiseW = runConfig.noiseW.value();
}
#ifdef HAVE_PCAUDIO
audio_object *my_audio = nullptr;
if (runConfig.outputType == OUTPUT_PLAY) {
// Output audio to the default audio device
my_audio = create_audio_device_object(NULL, "piper", "Text-to-Speech");
// TODO: Support 32-bit sample widths
auto audioFormat = AUDIO_OBJECT_FORMAT_S16LE;
int error = audio_object_open(my_audio, audioFormat,
voice.synthesisConfig.sampleRate,
voice.synthesisConfig.channels);
if (error != 0) {
throw runtime_error(audio_object_strerror(my_audio, error));
}
}
#else
if (runConfig.outputType == OUTPUT_PLAY) {
// Cannot play audio directly
cerr << "WARNING: Piper was not compiled with pcaudiolib. Output audio "
"will be written to the current directory."
<< endl;
runConfig.outputType = OUTPUT_DIRECTORY;
runConfig.outputPath = filesystem::path(".");
}
#endif
if (runConfig.outputType == OUTPUT_DIRECTORY) {
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
cerr << "Output directory: " << runConfig.outputPath.value() << endl;
@@ -155,7 +117,7 @@ int main(int argc, char *argv[]) {
// Output audio to automatically-named WAV file in a directory
ofstream audioFile(outputPath.string(), ios::binary);
piper::textToWavFile(voice, line, audioFile, result);
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
cout << outputPath.string() << endl;
} else if (runConfig.outputType == OUTPUT_FILE) {
// Read all of standard input before synthesizing.
@@ -168,10 +130,10 @@ int main(int argc, char *argv[]) {
// Output audio to WAV file
ofstream audioFile(runConfig.outputPath.value().string(), ios::binary);
piper::textToWavFile(voice, text.str(), audioFile, result);
piper::textToWavFile(piperConfig, voice, text.str(), audioFile, result);
} else if (runConfig.outputType == OUTPUT_STDOUT) {
// Output WAV to stdout
piper::textToWavFile(voice, line, cout, result);
piper::textToWavFile(piperConfig, voice, line, cout, result);
} else if (runConfig.outputType == OUTPUT_RAW) {
// Raw output to stdout
mutex mutAudio;
@@ -195,7 +157,8 @@ int main(int argc, char *argv[]) {
cvAudio.notify_one();
}
};
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
audioCallback);
// Signal thread that there is no more audio
{
@@ -208,45 +171,6 @@ int main(int argc, char *argv[]) {
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
rawOutputThread.join();
} else if (runConfig.outputType == OUTPUT_PLAY) {
#ifdef HAVE_PCAUDIO
mutex mutAudio;
condition_variable cvAudio;
bool audioReady = false;
bool audioFinished = false;
vector<int16_t> audioBuffer;
vector<int16_t> sharedAudioBuffer;
thread playThread(playProc, my_audio, ref(sharedAudioBuffer),
ref(mutAudio), ref(cvAudio), ref(audioReady),
ref(audioFinished));
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
&cvAudio, &audioReady]() {
// Signal thread that audio is ready
{
unique_lock lockAudio(mutAudio);
copy(audioBuffer.begin(), audioBuffer.end(),
back_inserter(sharedAudioBuffer));
audioReady = true;
cvAudio.notify_one();
}
};
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
// Signal thread that there is no more audio
{
unique_lock lockAudio(mutAudio);
audioReady = true;
audioFinished = true;
cvAudio.notify_one();
}
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
playThread.join();
#else
throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
#endif
}
cerr << "Real-time factor: " << result.realTimeFactor
@@ -254,13 +178,7 @@ int main(int argc, char *argv[]) {
<< " sec, audio=" << result.audioSeconds << " sec)" << endl;
}
piper::terminate();
#ifdef HAVE_PCAUDIO
audio_object_close(my_audio);
audio_object_destroy(my_audio);
my_audio = nullptr;
#endif
piper::terminate(piperConfig);
return EXIT_SUCCESS;
}
@@ -296,43 +214,6 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
} // rawOutputProc
#ifdef HAVE_PCAUDIO
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
bool &audioFinished) {
vector<int16_t> internalAudioBuffer;
while (true) {
{
unique_lock lockAudio{mutAudio};
cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });
if (sharedAudioBuffer.empty() && audioFinished) {
break;
}
copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(),
back_inserter(internalAudioBuffer));
sharedAudioBuffer.clear();
if (!audioFinished) {
audioReady = false;
}
}
int error =
audio_object_write(my_audio, (const char *)internalAudioBuffer.data(),
sizeof(int16_t) * internalAudioBuffer.size());
if (error != 0) {
throw runtime_error(audio_object_strerror(my_audio, error));
}
audio_object_flush(my_audio);
internalAudioBuffer.clear();
}
} // playProc
#endif
void printUsage(char *argv[]) {
cerr << endl;
cerr << "usage: " << argv[0] << " [options]" << endl;

@@ -1,53 +0,0 @@
#ifndef MODEL_H_
#define MODEL_H_
#include <string>
#include <onnxruntime_cxx_api.h>
using namespace std;
namespace piper {
const string instanceName{"piper"};
struct ModelSession {
Ort::Session onnx;
Ort::AllocatorWithDefaultOptions allocator;
Ort::SessionOptions options;
Ort::Env env;
ModelSession() : onnx(nullptr){};
};
void loadModel(string modelPath, ModelSession &session) {
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
instanceName.c_str());
session.env.DisableTelemetryEvents();
// Slows down performance by ~2x
// session.options.SetIntraOpNumThreads(1);
// Roughly doubles load time for no visible inference benefit
// session.options.SetGraphOptimizationLevel(
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
session.options.SetGraphOptimizationLevel(
GraphOptimizationLevel::ORT_DISABLE_ALL);
// Slows down performance very slightly
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
session.options.DisableCpuMemArena();
session.options.DisableMemPattern();
session.options.DisableProfiling();
auto startTime = chrono::steady_clock::now();
session.onnx = Ort::Session(session.env, filesystem::path(modelPath).c_str(), session.options);
auto endTime = chrono::steady_clock::now();
auto loadDuration = chrono::duration<double>(endTime - startTime);
}
} // namespace piper
#endif // MODEL_H_

@@ -1,142 +0,0 @@
#ifndef PHONEMIZE_H_
#define PHONEMIZE_H_
#include <filesystem>
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
#include <espeak-ng/speak_lib.h>
#include "config.hpp"
#include "utf8.h"
#define CLAUSE_INTONATION_FULL_STOP 0x00000000
#define CLAUSE_INTONATION_COMMA 0x00001000
#define CLAUSE_INTONATION_QUESTION 0x00002000
#define CLAUSE_INTONATION_EXCLAMATION 0x00003000
#define CLAUSE_TYPE_SENTENCE 0x00080000
using namespace std;
namespace piper {
// Text to phonemes using eSpeak-ng
void phonemize(string text, PhonemizeConfig &phonemizeConfig,
vector<vector<Phoneme>> &phonemes) {
if (!phonemizeConfig.eSpeak) {
throw runtime_error("Missing eSpeak config");
}
auto voice = phonemizeConfig.eSpeak->voice;
int result = espeak_SetVoiceByName(voice.c_str());
if (result != 0) {
throw runtime_error("Failed to set eSpeak-ng voice");
}
// Modified by eSpeak
string textCopy(text);
utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
vector<char32_t> textClauseBreakers;
// Identify clause breakers in the sentence, since eSpeak removes them during
// phonemization.
//
// This will unfortunately do the wrong thing with abbreviations, etc.
while (textIter != textIterEnd) {
auto codepoint = *textIter;
if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
textClauseBreakers.push_back(codepoint);
}
textIter++;
}
vector<Phoneme> *sentencePhonemes = nullptr;
const char *inputTextPointer = textCopy.c_str();
int terminator = 0;
while (inputTextPointer != NULL) {
// Modified espeak-ng API to get access to clause terminator
string clausePhonemes(
espeak_TextToPhonemes2((const void **)&inputTextPointer,
/*textmode*/ espeakCHARS_AUTO,
/*phonememode = IPA*/ 0x02,
&terminator));
utf8::iterator phonemeIter(clausePhonemes.begin(), clausePhonemes.begin(),
clausePhonemes.end());
utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
clausePhonemes.end());
if (!sentencePhonemes) {
// Start new sentence
phonemes.emplace_back();
sentencePhonemes = &phonemes[phonemes.size() - 1];
}
sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
// Add appropriate punctuation depending on terminator type
int intonation = terminator & 0x0000F000;
if (intonation == CLAUSE_INTONATION_FULL_STOP) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->fullStop);
} else if (intonation == CLAUSE_INTONATION_COMMA) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->comma);
} else if (intonation == CLAUSE_INTONATION_QUESTION) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->question);
} else if (intonation == CLAUSE_INTONATION_EXCLAMATION) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->exclamation);
}
if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) {
// End of sentence
sentencePhonemes = nullptr;
}
} // while inputTextPointer != NULL
} /* phonemize */
// Phonemes to ids using JSON map
void phonemes2ids(vector<Phoneme> &phonemes, PhonemizeConfig &phonemizeConfig,
vector<PhonemeId> &phonemeIds) {
if (phonemes.empty()) {
throw runtime_error("No phonemes");
}
phonemeIds.push_back(phonemizeConfig.idBos);
if (phonemizeConfig.interspersePad) {
phonemeIds.push_back(phonemizeConfig.idPad);
}
for (auto phoneme = phonemes.begin(); phoneme != phonemes.end(); phoneme++) {
if (phonemizeConfig.phonemeIdMap.contains(*phoneme)) {
for (auto id : phonemizeConfig.phonemeIdMap[*phoneme]) {
phonemeIds.push_back(id);
if (phonemizeConfig.interspersePad) {
phonemeIds.push_back(phonemizeConfig.idPad);
}
}
} else {
string phonemeStr;
utf8::append(*phoneme, phonemeStr);
cerr << "[WARN] No id for phoneme: " << phonemeStr << endl;
}
}
phonemeIds.push_back(phonemizeConfig.idEos);
} /* phonemes2ids */
} // namespace piper
#endif // PHONEMIZE_H_

@@ -0,0 +1,393 @@
#include <array>
#include <chrono>
#include <fstream>
#include <limits>
#include <stdexcept>
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include "piper.hpp"
#include "utf8.h"
#include "wavfile.hpp"
namespace piper {
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
const std::string instanceName{"piper"};
bool isSingleCodepoint(std::string s) {
return utf8::distance(s.begin(), s.end()) == 1;
}
Phoneme getCodepoint(std::string s) {
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
return *character_iter;
}
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
if (configRoot.contains("espeak")) {
if (!phonemizeConfig.eSpeak) {
phonemizeConfig.eSpeak.emplace();
}
auto espeakValue = configRoot["espeak"];
if (espeakValue.contains("voice")) {
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
}
}
if (configRoot.contains("phoneme_type")) {
auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
if (phonemeTypeStr == "text") {
phonemizeConfig.phonemeType = TextPhonemes;
}
}
// phoneme to [phoneme] map
if (configRoot.contains("phoneme_map")) {
if (!phonemizeConfig.phonemeMap) {
phonemizeConfig.phonemeMap.emplace();
}
auto phonemeMapValue = configRoot["phoneme_map"];
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
std::string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw std::runtime_error(
"Phonemes must be one codepoint (phoneme map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
std::string toPhoneme = toPhonemeValue.get<std::string>();
if (!isSingleCodepoint(toPhoneme)) {
throw std::runtime_error(
"Phonemes must be one codepoint (phoneme map)");
}
auto toCodepoint = getCodepoint(toPhoneme);
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
}
}
}
// phoneme to [id] map
if (configRoot.contains("phoneme_id_map")) {
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
std::string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw std::runtime_error(
"Phonemes must be one codepoint (phoneme id map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toIdValue : fromPhonemeItem.value()) {
PhonemeId toId = toIdValue.get<PhonemeId>();
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
}
}
}
} /* parsePhonemizeConfig */
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
if (configRoot.contains("audio")) {
auto audioValue = configRoot["audio"];
if (audioValue.contains("sample_rate")) {
// Default sample rate is 22050 Hz
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
}
}
} /* parseSynthesisConfig */
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
} /* parseModelConfig */
void initialize(PiperConfig &config) {
if (config.useESpeak) {
// Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
// See: https://github.com/rhasspy/espeak-ng
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ config.eSpeakDataPath.c_str(),
/*options*/ 0);
if (result < 0) {
throw std::runtime_error("Failed to initialize eSpeak-ng");
}
}
}
void terminate(PiperConfig &config) {
if (config.useESpeak) {
// Clean up espeak-ng
espeak_Terminate();
}
}
void loadModel(std::string modelPath, ModelSession &session) {
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
instanceName.c_str());
session.env.DisableTelemetryEvents();
// Slows down performance by ~2x
// session.options.SetIntraOpNumThreads(1);
// Roughly doubles load time for no visible inference benefit
// session.options.SetGraphOptimizationLevel(
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
session.options.SetGraphOptimizationLevel(
GraphOptimizationLevel::ORT_DISABLE_ALL);
// Slows down performance very slightly
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
session.options.DisableCpuMemArena();
session.options.DisableMemPattern();
session.options.DisableProfiling();
auto startTime = std::chrono::steady_clock::now();
session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
auto endTime = std::chrono::steady_clock::now();
auto loadDuration = std::chrono::duration<double>(endTime - startTime);
}
// Load Onnx model and JSON config file
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId) {
std::ifstream modelConfigFile(modelConfigPath);
voice.configRoot = json::parse(modelConfigFile);
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
if (voice.modelConfig.numSpeakers > 1) {
// Multi-speaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
loadModel(modelPath, voice.session);
} /* loadVoice */
// Phoneme ids to WAV audio
void synthesize(std::vector<PhonemeId> &phonemeIds,
SynthesisConfig &synthesisConfig, ModelSession &session,
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
std::vector<float> scales{synthesisConfig.noiseScale,
synthesisConfig.lengthScale,
synthesisConfig.noiseW};
std::vector<Ort::Value> inputTensors;
std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
std::vector<int64_t> scalesShape{(int64_t)scales.size()};
inputTensors.push_back(
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
// Add speaker id.
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
std::vector<int64_t> speakerId{
(int64_t)synthesisConfig.speakerId.value_or(0)};
std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
if (synthesisConfig.speakerId) {
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// From export_onnx.py
std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
"sid"};
std::array<const char *, 1> outputNames = {"output"};
// Infer
auto startTime = std::chrono::steady_clock::now();
auto outputTensors = session.onnx.Run(
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
inputTensors.size(), outputNames.data(), outputNames.size());
auto endTime = std::chrono::steady_clock::now();
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
throw std::runtime_error("Invalid output tensors");
}
auto inferDuration = std::chrono::duration<double>(endTime - startTime);
result.inferSeconds = inferDuration.count();
const float *audio = outputTensors.front().GetTensorData<float>();
auto audioShape =
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
int64_t audioCount = audioShape[audioShape.size() - 1];
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
result.realTimeFactor = 0.0;
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
// Get max audio value for scaling
float maxAudioValue = 0.01f;
for (int64_t i = 0; i < audioCount; i++) {
float audioValue = abs(audio[i]);
if (audioValue > maxAudioValue) {
maxAudioValue = audioValue;
}
}
// We know the size up front
audioBuffer.reserve(audioCount);
// Scale audio to fill range and convert to int16
float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
for (int64_t i = 0; i < audioCount; i++) {
int16_t intAudioValue = static_cast<int16_t>(
std::clamp(audio[i] * audioScale,
static_cast<float>(std::numeric_limits<int16_t>::min()),
static_cast<float>(std::numeric_limits<int16_t>::max())));
audioBuffer.push_back(intAudioValue);
}
// Clean up
for (std::size_t i = 0; i < outputTensors.size(); i++) {
Ort::detail::OrtRelease(outputTensors[i].release());
}
for (std::size_t i = 0; i < inputTensors.size(); i++) {
Ort::detail::OrtRelease(inputTensors[i].release());
}
}
// ----------------------------------------------------------------------------
// Phonemize text and synthesize audio
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback) {
std::size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (std::size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
// Phonemes for each sentence
std::vector<std::vector<Phoneme>> phonemes;
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
// Use espeak-ng for phonemization
eSpeakPhonemeConfig eSpeakConfig;
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
phonemize_eSpeak(text, eSpeakConfig, phonemes);
} else {
// Use UTF-8 codepoints as "phonemes"
CodepointsPhonemeConfig codepointsConfig;
phonemize_codepoints(text, codepointsConfig, phonemes);
}
// Synthesize each sentence independently.
std::vector<PhonemeId> phonemeIds;
std::map<Phoneme, std::size_t> missingPhonemes;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
SynthesisResult sentenceResult;
PhonemeIdConfig idConfig;
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
auto &language = voice.phonemizeConfig.eSpeak->voice;
if (DEFAULT_ALPHABET.count(language) < 1) {
throw std::runtime_error(
"Text phoneme language for voice is not supported");
}
// Use alphabet for language
idConfig.phonemeIdMap =
std::make_shared<PhonemeIdMap>(DEFAULT_ALPHABET[language]);
}
// phonemes -> ids
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
// ids -> audio
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Callback must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
// Phonemize text and synthesize audio to WAV file
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result) {
std::vector<int16_t> audioBuffer;
textToAudio(config, voice, text, audioBuffer, result, NULL);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
synthesisConfig.channels, (int32_t)audioBuffer.size(),
audioFile);
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
} /* textToWavFile */
} // namespace piper
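
Usage sketch (not part of the commit): the functions defined above form the new public flow introduced here — set eSpeakDataPath, loadVoice(), initialize(), then one of the textTo* calls, and finally terminate() — mirroring the updated main.cpp. A minimal stand-alone example; file paths are placeholders:

#include <fstream>
#include <optional>

#include "piper.hpp"

int main() {
  piper::PiperConfig config;
  // Assumption: espeak-ng-data sits next to the binary, as the Makefile's
  // copy step arranges.
  config.eSpeakDataPath = "./espeak-ng-data";

  piper::Voice voice;
  std::optional<piper::SpeakerId> speakerId; // empty -> default speaker
  piper::loadVoice(config, "voice.onnx", "voice.onnx.json", voice, speakerId);
  piper::initialize(config); // must be called before the textTo* functions

  piper::SynthesisResult result{};
  std::ofstream wavFile("output.wav", std::ios::binary);
  piper::textToWavFile(config, voice, "Hello from piper!", wavFile, result);

  piper::terminate(config);
  return 0;
}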

@@ -1,148 +1,105 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <filesystem>
#include <iostream>
#include <functional>
#include <fstream>
#include <optional>
#include <string>
#include <vector>
#include "json.hpp"
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include <phoneme_ids.hpp>
#include <phonemize.hpp>
#include "config.hpp"
#include "model.hpp"
#include "phonemize.hpp"
#include "synthesize.hpp"
#include "wavfile.hpp"
#include "json.hpp"
using json = nlohmann::json;
namespace piper {
struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
ModelSession session;
typedef int64_t SpeakerId;
struct eSpeakConfig {
std::string voice = "en-us";
};
void initialize(std::filesystem::path cwd) {
string dataPath;
struct PiperConfig {
std::string eSpeakDataPath;
bool useESpeak = true;
};
auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
if (std::filesystem::is_directory(cwdDataPath)) {
dataPath = cwdDataPath.string();
}
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
cerr << "dataPath: " << dataPath << endl;
struct PhonemizeConfig {
PhonemeType phonemeType = eSpeakPhonemes;
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
// Set up espeak-ng for calling espeak_TextToPhonemes
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ dataPath.c_str(),
/*options*/ 0);
if (result < 0) {
throw runtime_error("Failed to initialize eSpeak-ng");
}
}
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
bool interspersePad = true;
void terminate() {
// Clean up espeak-ng
espeak_Terminate();
}
std::optional<eSpeakConfig> eSpeak;
};
// Load Onnx model and JSON config file
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
optional<SpeakerId> &speakerId) {
ifstream modelConfigFile(modelConfigPath.c_str());
voice.configRoot = json::parse(modelConfigFile);
struct SynthesisConfig {
float noiseScale = 0.667f;
float lengthScale = 1.0f;
float noiseW = 0.8f;
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
std::optional<SpeakerId> speakerId;
float sentenceSilenceSeconds = 0.2f;
};
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
struct ModelConfig {
int numSpeakers;
};
if (voice.modelConfig.numSpeakers > 1) {
// Multispeaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
struct ModelSession {
Ort::Session onnx;
Ort::AllocatorWithDefaultOptions allocator;
Ort::SessionOptions options;
Ort::Env env;
loadModel(modelPath, voice.session);
ModelSession() : onnx(nullptr){};
};
} /* loadVoice */
struct SynthesisResult {
double inferSeconds;
double audioSeconds;
double realTimeFactor;
};
// Phonemize text and synthesize audio
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
SynthesisResult &result,
const function<void()> &audioCallback) {
size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
// Phonemes for each sentence
vector<vector<Phoneme>> phonemes;
phonemize(text, voice.phonemizeConfig, phonemes);
vector<PhonemeId> phonemeIds;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
vector<Phoneme> &sentencePhonemes = *phonemesIter;
SynthesisResult sentenceResult;
phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Callback must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
ModelSession session;
};
// Phonemize text and synthesize audio to WAV file
void textToWavFile(Voice &voice, string text, ostream &audioFile,
SynthesisResult &result) {
// Must be called before using textTo* functions
void initialize(PiperConfig &config);
vector<int16_t> audioBuffer;
textToAudio(voice, text, audioBuffer, result, NULL);
// Clean up
void terminate(PiperConfig &config);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
synthesisConfig.channels, (int32_t)audioBuffer.size(),
audioFile);
// Load Onnx model and JSON config file
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId);
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
// Phonemize text and synthesize audio
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback);
} /* textToWavFile */
// Phonemize text and synthesize audio to WAV file
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result);
} // namespace piper
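
Streaming sketch (not part of the commit): textToAudio() hands audio back one sentence at a time through the callback declared above, and the callback has to copy or write out audioBuffer because it is cleared after each sentence. A fragment, assuming config and voice have been prepared as in the previous sketch:

  // Requires <iostream> and <vector>; writes raw 16-bit PCM to stdout,
  // like main.cpp's OUTPUT_RAW path.
  std::vector<int16_t> audioBuffer;
  piper::SynthesisResult result{};
  auto audioCallback = [&audioBuffer]() {
    std::cout.write(reinterpret_cast<const char *>(audioBuffer.data()),
                    sizeof(int16_t) * audioBuffer.size());
  };
  piper::textToAudio(config, voice, "Hello from piper!", audioBuffer, result,
                     audioCallback);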

@@ -1,130 +0,0 @@
#ifndef SYNTHESIZE_H_
#define SYNTHESIZE_H_
#include <array>
#include <chrono>
#include <limits>
#include <memory>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include "config.hpp"
#include "model.hpp"
using namespace std;
namespace piper {
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
struct SynthesisResult {
double inferSeconds;
double audioSeconds;
double realTimeFactor;
};
// Phoneme ids to WAV audio
void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
ModelSession &session, vector<int16_t> &audioBuffer,
SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
synthesisConfig.noiseW};
vector<Ort::Value> inputTensors;
vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
vector<int64_t> scalesShape{(int64_t)scales.size()};
inputTensors.push_back(
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
// Add speaker id.
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
if (synthesisConfig.speakerId) {
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// From export_onnx.py
array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
"sid"};
array<const char *, 1> outputNames = {"output"};
// Infer
auto startTime = chrono::steady_clock::now();
auto outputTensors = session.onnx.Run(
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
inputTensors.size(), outputNames.data(), outputNames.size());
auto endTime = chrono::steady_clock::now();
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
throw runtime_error("Invalid output tensors");
}
auto inferDuration = chrono::duration<double>(endTime - startTime);
result.inferSeconds = inferDuration.count();
const float *audio = outputTensors.front().GetTensorData<float>();
auto audioShape =
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
int64_t audioCount = audioShape[audioShape.size() - 1];
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
result.realTimeFactor = 0.0;
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
// Get max audio value for scaling
float maxAudioValue = 0.01f;
for (int64_t i = 0; i < audioCount; i++) {
float audioValue = abs(audio[i]);
if (audioValue > maxAudioValue) {
maxAudioValue = audioValue;
}
}
// We know the size up front
audioBuffer.reserve(audioCount);
// Scale audio to fill range and convert to int16
float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
for (int64_t i = 0; i < audioCount; i++) {
int16_t intAudioValue = static_cast<int16_t>(
clamp(audio[i] * audioScale,
static_cast<float>(numeric_limits<int16_t>::min()),
static_cast<float>(numeric_limits<int16_t>::max())));
audioBuffer.push_back(intAudioValue);
}
// Clean up
for (size_t i = 0; i < outputTensors.size(); i++) {
Ort::detail::OrtRelease(outputTensors[i].release());
}
for (size_t i = 0; i < inputTensors.size(); i++) {
Ort::detail::OrtRelease(inputTensors[i].release());
}
}
} // namespace piper
#endif // SYNTHESIZE_H_

@@ -3,8 +3,6 @@
#include <iostream>
namespace piper {
struct WavHeader {
uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
uint32_t chunkSize;
@@ -14,7 +12,7 @@ struct WavHeader {
uint8_t fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmtSize = 16; // bytes
uint16_t audioFormat = 1; // PCM
uint16_t numChannels; // mono
uint16_t numChannels; // mono
uint32_t sampleRate; // Hertz
uint32_t bytesPerSec; // sampleRate * sampleWidth
uint16_t blockAlign = 2; // 16-bit mono
@@ -39,6 +37,4 @@ void writeWavHeader(int sampleRate, int sampleWidth, int channels,
} /* writeWavHeader */
} // namespace piper
#endif // WAVFILE_H_
