First working version with libpiper_phonemize

1 year ago · 810fad44cf
parent 7d27863b48
commit 810fad44cf
11 changed files with 503 additions and 776 deletions
--- a/16
+++ b/16
@ -1,16 +1,12 @@
-.PHONY: release debug clean test
+.PHONY: piper clean test
-release:
+LIB_DIR := lib/Linux-$(shell uname -m)
 	mkdir -p build
 	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
-no-pcaudio:
+piper:
 	mkdir -p build
-	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release -DUSE_PCAUDIO=OFF && make
+	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
-
+	cp -aR $(LIB_DIR)/piper_phonemize/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/
-debug:
+	cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/
 	mkdir -p build
 	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Debug && make
 clean:
 	rm -rf build/ dist/
--- a/lib/espeak-ng-1.52-patched.tar.gz
+++ b/lib/espeak-ng-1.52-patched.tar.gz
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@ -4,47 +4,31 @@ include(CheckIncludeFileCXX)
 project(piper C CXX)
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-ADD_EXECUTABLE(piper main.cpp)
+ADD_EXECUTABLE(piper main.cpp piper.cpp)
 string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
 string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
-find_package(PkgConfig)
+set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
-pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
+set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime)
 # https://github.com/espeak-ng/pcaudiolib
 check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
 if(PCAUDIO_INCLUDE_FOUND)
  option(USE_PCAUDIO "Build with pcaudiolib" ON)
  if(USE_PCAUDIO)
    target_compile_definitions(piper PUBLIC HAVE_PCAUDIO)
    set(PCAUDIO_LIBRARIES "pcaudio")
  endif()
 endif()
 set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR})
 target_link_libraries(piper
                      piper_phonemize
                      espeak-ng
                      onnxruntime
-                      pthread
+                      pthread)
                      ${ESPEAK_NG_LIBRARIES}
                      ${PCAUDIO_LIBRARIES})
 if(NOT APPLE)
  target_link_libraries(-static-libgcc -static-libstdc++)
 endif()
 target_link_directories(piper PUBLIC
-                        ${ESPEAK_NG_LIBRARY_DIRS}
+                        ${PIPER_PHONEMIZE_ROOTDIR}/lib
                        ${ONNXRUNTIME_ROOTDIR}/lib)
 target_include_directories(piper PUBLIC
-                           ${ONNXRUNTIME_ROOTDIR}/include
+                           ${PIPER_PHONEMIZE_ROOTDIR}/include
-                           ${ESPEAK_NG_INCLUDE_DIRS})
+                           ${ONNXRUNTIME_ROOTDIR}/include)
 target_compile_options(piper PUBLIC
                       ${ESPEAK_NG_CFLAGS_OTHER})
--- a/src/cpp/config.hpp
+++ b/src/cpp/config.hpp
@ -1,155 +0,0 @@
 #ifndef CONFIG_H_
 #define CONFIG_H_
 #include <filesystem>
 #include <map>
 #include <optional>
 #include <set>
 #include <stdexcept>
 #include <string>
 #include <vector>
 #include "json.hpp"
 #include "utf8.h"
 using namespace std;
 using json = nlohmann::json;
 namespace piper {
 // A phoneme is represented by a single Unicode codepoint
 using Phoneme = char32_t;
 // Numeric id that a phoneme is mapped to (see PhonemizeConfig::phonemeIdMap)
 using PhonemeId = int64_t;
 // Numeric id of a speaker
 using SpeakerId = int64_t;
 // Voice name used by eSpeakConfig when none is given
 const string DefaultVoice{"en-us"};
 // Input modes for eSpeak
 enum eSpeakMode {
   Text,
   TextWithPhonemes,
   SSML
 };
 // Settings for phonemization with eSpeak.
 struct eSpeakConfig {
   // eSpeak voice name
   string voice{DefaultVoice};
   // Input mode; plain text unless changed
   eSpeakMode mode{Text};
   // Characters that eSpeak uses to break apart paragraphs/sentences
   set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
   // Punctuation phonemes appended after a clause, by intonation type
   Phoneme fullStop{U'.'};
   Phoneme comma{U','};
   Phoneme question{U'?'};
   Phoneme exclamation{U'!'};
 };
 // Settings that control how text becomes phoneme ids.
 struct PhonemizeConfig {
   // Optional phoneme -> [phoneme] replacement table
   optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
   // phoneme -> [id] table
   map<Phoneme, vector<PhonemeId>> phonemeIdMap;
   PhonemeId idPad{0}; // padding (optionally interspersed)
   PhonemeId idBos{1}; // beginning of sentence
   PhonemeId idEos{2}; // end of sentence
   // When true, idPad is inserted between ids
   bool interspersePad{true};
   // eSpeak settings; empty until an "espeak" section has been parsed
   optional<eSpeakConfig> eSpeak;
 };
 // Parameters for inference and for the produced audio.
 struct SynthesisConfig {
   // Values passed to the model as its "scales" input
   float noiseScale{0.667f};
   float lengthScale{1.0f};
   float noiseW{0.8f};
   // Audio format
   int sampleRate{22050};
   int sampleWidth{2}; // 16-bit
   int channels{1};    // mono
   // Speaker to use; empty when no speaker id is supplied to the model
   optional<SpeakerId> speakerId;
   // Silence appended after each sentence, in seconds
   float sentenceSilenceSeconds{0.2f};
 };
 // Model-level settings read from the voice's JSON config.
 struct ModelConfig {
   // Number of speakers in the model ("num_speakers" in the JSON config).
   // Initialised so a default-constructed ModelConfig never holds an
   // indeterminate value.
   int numSpeakers = 0;
 };
 bool isSingleCodepoint(string s) {
  return utf8::distance(s.begin(), s.end()) == 1;
 }
 // Returns the first codepoint of the UTF-8 string `s`.
 // `s` must not be empty; callers in this file check isSingleCodepoint()
 // first. Declared inline because this header defines the function.
 inline Phoneme getCodepoint(const string &s) {
   utf8::iterator character_iter(s.begin(), s.begin(), s.end());
   return *character_iter;
 }
 // Fills `phonemizeConfig` from the voice's JSON config.
 //
 // Reads three optional sections of `configRoot`:
 //   "espeak"         -> eSpeak voice name
 //   "phoneme_map"    -> phoneme to [phoneme] map
 //   "phoneme_id_map" -> phoneme to [id] map
 // Entries are appended to whatever the maps already contain.
 //
 // Throws runtime_error when a map key (or a "phoneme_map" value) is not
 // exactly one codepoint.
 //
 // Declared inline because this header defines the function.
 inline void parsePhonemizeConfig(json &configRoot,
                                  PhonemizeConfig &phonemizeConfig) {
   if (configRoot.contains("espeak")) {
     if (!phonemizeConfig.eSpeak) {
       phonemizeConfig.eSpeak.emplace();
     }
     // Bind by reference: "auto x = json[...]" would deep-copy the subtree
     auto &espeakValue = configRoot["espeak"];
     if (espeakValue.contains("voice")) {
       phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<string>();
     }
   }
   // phoneme to [phoneme] map
   if (configRoot.contains("phoneme_map")) {
     if (!phonemizeConfig.phonemeMap) {
       phonemizeConfig.phonemeMap.emplace();
     }
     auto &phonemeMap = *phonemizeConfig.phonemeMap;
     auto &phonemeMapValue = configRoot["phoneme_map"];
     for (auto &fromPhonemeItem : phonemeMapValue.items()) {
       const auto &fromPhoneme = fromPhonemeItem.key();
       if (!isSingleCodepoint(fromPhoneme)) {
         throw runtime_error("Phonemes must be one codepoint (phoneme map)");
       }
       // Look up the destination vector once per key
       auto &toCodepoints = phonemeMap[getCodepoint(fromPhoneme)];
       for (auto &toPhonemeValue : fromPhonemeItem.value()) {
         const string toPhoneme = toPhonemeValue.get<string>();
         if (!isSingleCodepoint(toPhoneme)) {
           throw runtime_error("Phonemes must be one codepoint (phoneme map)");
         }
         toCodepoints.push_back(getCodepoint(toPhoneme));
       }
     }
   }
   // phoneme to [id] map
   if (configRoot.contains("phoneme_id_map")) {
     auto &phonemeIdMapValue = configRoot["phoneme_id_map"];
     for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
       const auto &fromPhoneme = fromPhonemeItem.key();
       if (!isSingleCodepoint(fromPhoneme)) {
         throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
       }
       auto &toIds = phonemizeConfig.phonemeIdMap[getCodepoint(fromPhoneme)];
       for (auto &toIdValue : fromPhonemeItem.value()) {
         toIds.push_back(toIdValue.get<PhonemeId>());
       }
     }
   }
 } /* parsePhonemizeConfig */
 // Reads audio settings from the voice's JSON config.
 // Only "audio"."sample_rate" is consulted; when it is absent,
 // `synthesisConfig.sampleRate` keeps its current value.
 //
 // Declared inline because this header defines the function.
 inline void parseSynthesisConfig(json &configRoot,
                                  SynthesisConfig &synthesisConfig) {
   if (configRoot.contains("audio")) {
     // Bind by reference to avoid deep-copying the "audio" object
     auto &audioValue = configRoot["audio"];
     if (audioValue.contains("sample_rate")) {
       // Default sample rate is 22050 Hz
       synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
     }
   }
 } /* parseSynthesisConfig */
 // Reads model-level settings from the voice's JSON config.
 // "num_speakers" is converted with get<SpeakerId>() and stored in
 // `modelConfig.numSpeakers`.
 //
 // Declared inline because this header defines the function.
 inline void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
   modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
 } /* parseModelConfig */
 } // namespace piper
 #endif // CONFIG_H_
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@ -2,6 +2,7 @@
 #include <condition_variable>
 #include <filesystem>
 #include <fstream>
 #include <functional>
 #include <iostream>
 #include <mutex>
 #include <sstream>
@ -10,11 +11,6 @@
 #include <thread>
 #include <vector>
 #ifdef HAVE_PCAUDIO
 // https://github.com/espeak-ng/pcaudiolib
 #include <pcaudiolib/audio.h>
 #endif
 #ifdef _MSC_VER
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
@ -29,19 +25,13 @@
 using namespace std;
-enum OutputType {
+enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
  OUTPUT_FILE,
  OUTPUT_DIRECTORY,
  OUTPUT_STDOUT,
  OUTPUT_PLAY,
  OUTPUT_RAW
 };
 struct RunConfig {
  filesystem::path modelPath;
  filesystem::path modelConfigPath;
-  OutputType outputType = OUTPUT_PLAY;
+  OutputType outputType = OUTPUT_DIRECTORY;
-  optional<filesystem::path> outputPath;
+  optional<filesystem::path> outputPath = filesystem::path(".");
  optional<piper::SpeakerId> speakerId;
  optional<float> noiseScale;
  optional<float> lengthScale;
@ -53,12 +43,6 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                   condition_variable &cvAudio, bool &audioReady,
                   bool &audioFinished);
 #ifdef HAVE_PCAUDIO
 void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
              mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
              bool &audioFinished);
 #endif
 int main(int argc, char *argv[]) {
  RunConfig runConfig;
  parseArgs(argc, argv, runConfig);
@ -66,7 +50,7 @@ int main(int argc, char *argv[]) {
  // NOTE: This won't work for Windows (need GetModuleFileName)
 #ifdef _MSC_VER
  auto exePath = []() {
-    wchar_t moduleFileName[MAX_PATH] = { 0 };
+    wchar_t moduleFileName[MAX_PATH] = {0};
    GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
    return filesystem::path(moduleFileName);
  }();
@ -81,17 +65,22 @@ int main(int argc, char *argv[]) {
 #else
  auto exePath = filesystem::canonical("/proc/self/exe");
 #endif
-#endif
+
-  piper::initialize(exePath.parent_path());
+  piper::PiperConfig piperConfig;
  piperConfig.eSpeakDataPath =
      std::filesystem::absolute(exePath.parent_path().append("espeak-ng-data"))
          .string();
  piper::Voice voice;
  auto startTime = chrono::steady_clock::now();
-  loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
+  loadVoice(piperConfig, runConfig.modelPath.string(),
-            voice, runConfig.speakerId);
+            runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
  auto endTime = chrono::steady_clock::now();
  auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
  cerr << "Load time: " << loadSeconds << " sec" << endl;
  piper::initialize(piperConfig);
  // Scales
  if (runConfig.noiseScale) {
    voice.synthesisConfig.noiseScale = runConfig.noiseScale.value();
@ -105,33 +94,6 @@ int main(int argc, char *argv[]) {
    voice.synthesisConfig.noiseW = runConfig.noiseW.value();
  }
 #ifdef HAVE_PCAUDIO
  audio_object *my_audio = nullptr;
  if (runConfig.outputType == OUTPUT_PLAY) {
    // Output audio to the default audio device
    my_audio = create_audio_device_object(NULL, "piper", "Text-to-Speech");
    // TODO: Support 32-bit sample widths
    auto audioFormat = AUDIO_OBJECT_FORMAT_S16LE;
    int error = audio_object_open(my_audio, audioFormat,
                                  voice.synthesisConfig.sampleRate,
                                  voice.synthesisConfig.channels);
    if (error != 0) {
      throw runtime_error(audio_object_strerror(my_audio, error));
    }
  }
 #else
  if (runConfig.outputType == OUTPUT_PLAY) {
    // Cannot play audio directly
    cerr << "WARNING: Piper was not compiled with pcaudiolib. Output audio "
            "will be written to the current directory."
         << endl;
    runConfig.outputType = OUTPUT_DIRECTORY;
    runConfig.outputPath = filesystem::path(".");
  }
 #endif
  if (runConfig.outputType == OUTPUT_DIRECTORY) {
    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
    cerr << "Output directory: " << runConfig.outputPath.value() << endl;
@ -155,7 +117,7 @@ int main(int argc, char *argv[]) {
      // Output audio to automatically-named WAV file in a directory
      ofstream audioFile(outputPath.string(), ios::binary);
-      piper::textToWavFile(voice, line, audioFile, result);
+      piper::textToWavFile(piperConfig, voice, line, audioFile, result);
      cout << outputPath.string() << endl;
    } else if (runConfig.outputType == OUTPUT_FILE) {
      // Read all of standard input before synthesizing.
@ -168,10 +130,10 @@ int main(int argc, char *argv[]) {
      // Output audio to WAV file
      ofstream audioFile(runConfig.outputPath.value().string(), ios::binary);
-      piper::textToWavFile(voice, text.str(), audioFile, result);
+      piper::textToWavFile(piperConfig, voice, text.str(), audioFile, result);
    } else if (runConfig.outputType == OUTPUT_STDOUT) {
      // Output WAV to stdout
-      piper::textToWavFile(voice, line, cout, result);
+      piper::textToWavFile(piperConfig, voice, line, cout, result);
    } else if (runConfig.outputType == OUTPUT_RAW) {
      // Raw output to stdout
      mutex mutAudio;
@ -195,7 +157,8 @@ int main(int argc, char *argv[]) {
          cvAudio.notify_one();
        }
      };
-      piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
+      piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
                         audioCallback);
      // Signal thread that there is no more audio
      {
@ -208,45 +171,6 @@ int main(int argc, char *argv[]) {
      // Wait for audio output to finish
      cerr << "Waiting for audio..." << endl;
      rawOutputThread.join();
    } else if (runConfig.outputType == OUTPUT_PLAY) {
 #ifdef HAVE_PCAUDIO
      mutex mutAudio;
      condition_variable cvAudio;
      bool audioReady = false;
      bool audioFinished = false;
      vector<int16_t> audioBuffer;
      vector<int16_t> sharedAudioBuffer;
      thread playThread(playProc, my_audio, ref(sharedAudioBuffer),
                        ref(mutAudio), ref(cvAudio), ref(audioReady),
                        ref(audioFinished));
      auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
                            &cvAudio, &audioReady]() {
        // Signal thread that audio is ready
        {
          unique_lock lockAudio(mutAudio);
          copy(audioBuffer.begin(), audioBuffer.end(),
               back_inserter(sharedAudioBuffer));
          audioReady = true;
          cvAudio.notify_one();
        }
      };
      piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
      // Signal thread that there is no more audio
      {
        unique_lock lockAudio(mutAudio);
        audioReady = true;
        audioFinished = true;
        cvAudio.notify_one();
      }
      // Wait for audio output to finish
      cerr << "Waiting for audio..." << endl;
      playThread.join();
 #else
      throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
 #endif
    }
    cerr << "Real-time factor: " << result.realTimeFactor
@ -254,13 +178,7 @@ int main(int argc, char *argv[]) {
         << " sec, audio=" << result.audioSeconds << " sec)" << endl;
  }
-  piper::terminate();
+  piper::terminate(piperConfig);
 #ifdef HAVE_PCAUDIO
  audio_object_close(my_audio);
  audio_object_destroy(my_audio);
  my_audio = nullptr;
 #endif
  return EXIT_SUCCESS;
 }
@ -296,43 +214,6 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
 } // rawOutputProc
 #ifdef HAVE_PCAUDIO
 void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
              mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
              bool &audioFinished) {
  vector<int16_t> internalAudioBuffer;
  while (true) {
    {
      unique_lock lockAudio{mutAudio};
      cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });
      if (sharedAudioBuffer.empty() && audioFinished) {
        break;
      }
      copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(),
           back_inserter(internalAudioBuffer));
      sharedAudioBuffer.clear();
      if (!audioFinished) {
        audioReady = false;
      }
    }
    int error =
        audio_object_write(my_audio, (const char *)internalAudioBuffer.data(),
                           sizeof(int16_t) * internalAudioBuffer.size());
    if (error != 0) {
      throw runtime_error(audio_object_strerror(my_audio, error));
    }
    audio_object_flush(my_audio);
    internalAudioBuffer.clear();
  }
 } // playProc
 #endif
 void printUsage(char *argv[]) {
  cerr << endl;
  cerr << "usage: " << argv[0] << " [options]" << endl;
--- a/src/cpp/model.hpp
+++ b/src/cpp/model.hpp
@ -1,53 +0,0 @@
 #ifndef MODEL_H_
 #define MODEL_H_
 #include <string>
 #include <onnxruntime_cxx_api.h>
 using namespace std;
 namespace piper {
 // Name handed to Ort::Env when the environment is created
 const string instanceName = "piper";
 // Bundles the ONNX Runtime objects that belong to one loaded model.
 // NOTE(review): members are destroyed in reverse declaration order, so `env`
 // goes away before `onnx` - confirm this is acceptable for ONNX Runtime.
 struct ModelSession {
   Ort::Session onnx;
   Ort::AllocatorWithDefaultOptions allocator;
   Ort::SessionOptions options;
   Ort::Env env;
   // Starts with a null session; loadModel() assigns the real one
   ModelSession() : onnx{nullptr} {}
 };
 // Creates the ONNX Runtime environment and session for the model file at
 // `modelPath` and stores both in `session`.
 //
 // The path goes through filesystem::path so that c_str() yields the
 // platform's native character type.
 //
 // Declared inline because this header defines the function. The timing
 // locals were removed because their result was never read.
 inline void loadModel(string modelPath, ModelSession &session) {
   session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                          instanceName.c_str());
   session.env.DisableTelemetryEvents();
   // Slows down performance by ~2x
   // session.options.SetIntraOpNumThreads(1);
   // Roughly doubles load time for no visible inference benefit
   // session.options.SetGraphOptimizationLevel(
   //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
   session.options.SetGraphOptimizationLevel(
       GraphOptimizationLevel::ORT_DISABLE_ALL);
   // Slows down performance very slightly
   // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
   session.options.DisableCpuMemArena();
   session.options.DisableMemPattern();
   session.options.DisableProfiling();
   session.onnx = Ort::Session(session.env, filesystem::path(modelPath).c_str(),
                               session.options);
 }
 } // namespace piper
 #endif // MODEL_H_
--- a/src/cpp/phonemize.hpp
+++ b/src/cpp/phonemize.hpp
@ -1,142 +0,0 @@
 #ifndef PHONEMIZE_H_
 #define PHONEMIZE_H_
 #include <filesystem>
 #include <iostream>
 #include <map>
 #include <optional>
 #include <set>
 #include <stdexcept>
 #include <string>
 #include <vector>
 #include <espeak-ng/speak_lib.h>
 #include "config.hpp"
 #include "utf8.h"
 #define CLAUSE_INTONATION_FULL_STOP   0x00000000
 #define CLAUSE_INTONATION_COMMA       0x00001000
 #define CLAUSE_INTONATION_QUESTION    0x00002000
 #define CLAUSE_INTONATION_EXCLAMATION 0x00003000
 #define CLAUSE_TYPE_SENTENCE          0x00080000
 using namespace std;
 namespace piper {
 // Text to phonemes using eSpeak-ng
 //
 // Feeds `text` to eSpeak clause by clause and appends one vector of phoneme
 // codepoints per sentence to `phonemes`. After each clause a punctuation
 // phoneme from the eSpeak config is appended according to the clause's
 // intonation type.
 //
 // Throws runtime_error when `phonemizeConfig.eSpeak` is not set or the
 // configured voice cannot be selected.
 //
 // Declared inline because this header defines the function.
 inline void phonemize(string text, PhonemizeConfig &phonemizeConfig,
                       vector<vector<Phoneme>> &phonemes) {
   if (!phonemizeConfig.eSpeak) {
     throw runtime_error("Missing eSpeak config");
   }
   auto voice = phonemizeConfig.eSpeak->voice;
   int result = espeak_SetVoiceByName(voice.c_str());
   if (result != 0) {
     throw runtime_error("Failed to set eSpeak-ng voice");
   }
   // Modified by eSpeak
   string textCopy(text);
   utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
   utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
   vector<char32_t> textClauseBreakers;
   // Identify clause breakers in the sentence, since eSpeak removes them during
   // phonemization.
   //
   // This will unfortunately do the wrong thing with abbreviations, etc.
   //
   // NOTE(review): textClauseBreakers is filled here but never read below.
   // The scan is kept because iterating may reject malformed UTF-8 - confirm
   // before removing.
   while (textIter != textIterEnd) {
     auto codepoint = *textIter;
     if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
       textClauseBreakers.push_back(codepoint);
     }
     textIter++;
   }
   vector<Phoneme> *sentencePhonemes = nullptr;
   const char *inputTextPointer = textCopy.c_str();
   int terminator = 0;
   // eSpeak advances inputTextPointer and sets it to NULL once all text is used
   while (inputTextPointer != nullptr) {
     // Modified espeak-ng API to get access to clause terminator
     const char *clauseText =
         espeak_TextToPhonemes2((const void **)&inputTextPointer,
                                /*textmode*/ espeakCHARS_AUTO,
                                /*phonememode = IPA*/ 0x02, &terminator);
     // Building a string from a null pointer is undefined behaviour, so a
     // null result is treated as an empty clause.
     string clausePhonemes(clauseText != nullptr ? clauseText : "");
     utf8::iterator phonemeIter(clausePhonemes.begin(), clausePhonemes.begin(),
                                clausePhonemes.end());
     utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
                               clausePhonemes.end());
     if (!sentencePhonemes) {
       // Start new sentence
       phonemes.emplace_back();
       sentencePhonemes = &phonemes.back();
     }
     sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
     // Add appropriate punctuation depending on terminator type
     int intonation = terminator & 0x0000F000;
     if (intonation == CLAUSE_INTONATION_FULL_STOP) {
       sentencePhonemes->push_back(phonemizeConfig.eSpeak->fullStop);
     } else if (intonation == CLAUSE_INTONATION_COMMA) {
       sentencePhonemes->push_back(phonemizeConfig.eSpeak->comma);
     } else if (intonation == CLAUSE_INTONATION_QUESTION) {
       sentencePhonemes->push_back(phonemizeConfig.eSpeak->question);
     } else if (intonation == CLAUSE_INTONATION_EXCLAMATION) {
       sentencePhonemes->push_back(phonemizeConfig.eSpeak->exclamation);
     }
     if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) {
       // End of sentence
       sentencePhonemes = nullptr;
     }
   } // while inputTextPointer != nullptr
 } /* phonemize */
 // Phonemes to ids using JSON map
 //
 // Appends to `phonemeIds`: idBos, then the ids of every phoneme found in
 // `phonemizeConfig.phonemeIdMap`, then idEos. When interspersePad is set,
 // idPad follows idBos and every mapped id. Phonemes without an id are
 // skipped with a warning on stderr.
 //
 // Throws runtime_error when `phonemes` is empty.
 //
 // Declared inline because this header defines the function.
 inline void phonemes2ids(vector<Phoneme> &phonemes,
                          PhonemizeConfig &phonemizeConfig,
                          vector<PhonemeId> &phonemeIds) {
   if (phonemes.empty()) {
     throw runtime_error("No phonemes");
   }
   phonemeIds.push_back(phonemizeConfig.idBos);
   if (phonemizeConfig.interspersePad) {
     phonemeIds.push_back(phonemizeConfig.idPad);
   }
   const auto &idMap = phonemizeConfig.phonemeIdMap;
   for (const Phoneme phoneme : phonemes) {
     // One lookup instead of contains() followed by operator[]
     const auto idsIter = idMap.find(phoneme);
     if (idsIter == idMap.end()) {
       string phonemeStr;
       utf8::append(phoneme, phonemeStr);
       cerr << "[WARN] No id for phoneme: " << phonemeStr << endl;
       continue;
     }
     for (const PhonemeId id : idsIter->second) {
       phonemeIds.push_back(id);
       if (phonemizeConfig.interspersePad) {
         phonemeIds.push_back(phonemizeConfig.idPad);
       }
     }
   }
   phonemeIds.push_back(phonemizeConfig.idEos);
 } /* phonemes2ids */
 } // namespace piper
 #endif // PHONEMIZE_H_
--- a/src/cpp/piper.cpp
+++ b/src/cpp/piper.cpp
@ -0,0 +1,393 @@
 #include <algorithm>
 #include <array>
 #include <chrono>
 #include <cmath>
 #include <filesystem>
 #include <fstream>
 #include <limits>
 #include <stdexcept>
 #include <espeak-ng/speak_lib.h>
 #include <onnxruntime_cxx_api.h>
 #include "piper.hpp"
 #include "utf8.h"
 #include "wavfile.hpp"
 namespace piper {
 // Largest sample value of a signed 16-bit WAV, as a float for scaling
 const float MAX_WAV_VALUE{32767.0f};
 // Name handed to Ort::Env when the environment is created
 const std::string instanceName = "piper";
 // True when the UTF-8 string `s` holds exactly one codepoint.
 bool isSingleCodepoint(std::string s) {
   const auto codepointCount = utf8::distance(s.begin(), s.end());
   return codepointCount == 1;
 }
 // Decodes and returns the first codepoint of the UTF-8 string `s`.
 Phoneme getCodepoint(std::string s) {
   utf8::iterator firstCodepoint(s.begin(), s.begin(), s.end());
   const Phoneme codepoint = *firstCodepoint;
   return codepoint;
 }
 // Fills `phonemizeConfig` from the voice's JSON config.
 //
 // Reads four optional entries of `configRoot`:
 //   "espeak"         -> eSpeak voice name
 //   "phoneme_type"   -> "text" selects TextPhonemes; anything else leaves
 //                       phonemeType unchanged
 //   "phoneme_map"    -> phoneme to [phoneme] map
 //   "phoneme_id_map" -> phoneme to [id] map
 // Entries are appended to whatever the maps already contain.
 //
 // Throws std::runtime_error when a map key (or a "phoneme_map" value) is not
 // exactly one codepoint.
 void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
   if (configRoot.contains("espeak")) {
     if (!phonemizeConfig.eSpeak) {
       phonemizeConfig.eSpeak.emplace();
     }
     // Bind by reference: "auto x = json[...]" would deep-copy the subtree
     auto &espeakValue = configRoot["espeak"];
     if (espeakValue.contains("voice")) {
       phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
     }
   }
   if (configRoot.contains("phoneme_type")) {
     auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
     if (phonemeTypeStr == "text") {
       phonemizeConfig.phonemeType = TextPhonemes;
     }
   }
   // phoneme to [phoneme] map
   if (configRoot.contains("phoneme_map")) {
     if (!phonemizeConfig.phonemeMap) {
       phonemizeConfig.phonemeMap.emplace();
     }
     auto &phonemeMap = *phonemizeConfig.phonemeMap;
     auto &phonemeMapValue = configRoot["phoneme_map"];
     for (auto &fromPhonemeItem : phonemeMapValue.items()) {
       const auto &fromPhoneme = fromPhonemeItem.key();
       if (!isSingleCodepoint(fromPhoneme)) {
         throw std::runtime_error(
             "Phonemes must be one codepoint (phoneme map)");
       }
       // Look up the destination vector once per key
       auto &toCodepoints = phonemeMap[getCodepoint(fromPhoneme)];
       for (auto &toPhonemeValue : fromPhonemeItem.value()) {
         const std::string toPhoneme = toPhonemeValue.get<std::string>();
         if (!isSingleCodepoint(toPhoneme)) {
           throw std::runtime_error(
               "Phonemes must be one codepoint (phoneme map)");
         }
         toCodepoints.push_back(getCodepoint(toPhoneme));
       }
     }
   }
   // phoneme to [id] map
   if (configRoot.contains("phoneme_id_map")) {
     auto &phonemeIdMapValue = configRoot["phoneme_id_map"];
     for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
       const auto &fromPhoneme = fromPhonemeItem.key();
       if (!isSingleCodepoint(fromPhoneme)) {
         throw std::runtime_error(
             "Phonemes must be one codepoint (phoneme id map)");
       }
       auto &toIds = phonemizeConfig.phonemeIdMap[getCodepoint(fromPhoneme)];
       for (auto &toIdValue : fromPhonemeItem.value()) {
         toIds.push_back(toIdValue.get<PhonemeId>());
       }
     }
   }
 } /* parsePhonemizeConfig */
 // Copies "audio"."sample_rate" from the JSON config into `synthesisConfig`.
 // When either key is missing, the existing sample rate is left untouched.
 void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
   if (!configRoot.contains("audio")) {
     return;
   }
   auto audioSection = configRoot["audio"];
   if (!audioSection.contains("sample_rate")) {
     return;
   }
   // Default sample rate is 22050 Hz
   synthesisConfig.sampleRate = audioSection.value("sample_rate", 22050);
 } /* parseSynthesisConfig */
 // Stores the JSON config's "num_speakers" value in `modelConfig`.
 void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
   auto &numSpeakersValue = configRoot["num_speakers"];
   modelConfig.numSpeakers = numSpeakersValue.get<SpeakerId>();
 } /* parseModelConfig */
 // Initializes espeak-ng with the data directory from `config`.
 // Does nothing when `config.useESpeak` is false.
 // Throws std::runtime_error when espeak_Initialize reports a failure.
 void initialize(PiperConfig &config) {
   if (!config.useESpeak) {
     return;
   }
   // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
   // See: https://github.com/rhasspy/espeak-ng
   const char *dataPath = config.eSpeakDataPath.c_str();
   const int status = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                                        /*buflength*/ 0,
                                        /*path*/ dataPath,
                                        /*options*/ 0);
   if (status < 0) {
     throw std::runtime_error("Failed to initialize eSpeak-ng");
   }
 }
 // Shuts espeak-ng down again; a no-op when `config.useESpeak` is false.
 void terminate(PiperConfig &config) {
   if (!config.useESpeak) {
     return;
   }
   // Clean up espeak-ng
   espeak_Terminate();
 }
 // Creates the ONNX Runtime environment and session for the model file at
 // `modelPath` and stores both in `session`.
 //
 // Ort::Session takes the model path as ORTCHAR_T*, which is wchar_t on
 // Windows and char elsewhere. Going through std::filesystem::path makes
 // c_str() return the platform's native character type, so the call compiles
 // on both. The timing locals were removed because their result was never
 // read.
 void loadModel(std::string modelPath, ModelSession &session) {
   session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                          instanceName.c_str());
   session.env.DisableTelemetryEvents();
   // Slows down performance by ~2x
   // session.options.SetIntraOpNumThreads(1);
   // Roughly doubles load time for no visible inference benefit
   // session.options.SetGraphOptimizationLevel(
   //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
   session.options.SetGraphOptimizationLevel(
       GraphOptimizationLevel::ORT_DISABLE_ALL);
   // Slows down performance very slightly
   // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
   session.options.DisableCpuMemArena();
   session.options.DisableMemPattern();
   session.options.DisableProfiling();
   const std::filesystem::path nativeModelPath(modelPath);
   session.onnx =
       Ort::Session(session.env, nativeModelPath.c_str(), session.options);
 }
 // Load Onnx model and JSON config file
 //
 // Parses the JSON file at `modelConfigPath` into `voice` (phonemize,
 // synthesis and model settings), then loads the ONNX model at `modelPath`
 // into `voice.session`. For models with more than one speaker the speaker id
 // is set to `speakerId`, or to 0 when none was given.
 //
 // Throws std::runtime_error when the config file cannot be opened. Errors
 // from JSON parsing and model loading propagate unchanged.
 void loadVoice(PiperConfig &config, std::string modelPath,
                std::string modelConfigPath, Voice &voice,
                std::optional<SpeakerId> &speakerId) {
   (void)config; // not needed for loading; kept for a uniform API
   std::ifstream modelConfigFile(modelConfigPath);
   if (!modelConfigFile.is_open()) {
     // Report the real cause instead of a JSON parse error on an empty stream
     throw std::runtime_error("Failed to open model config file: " +
                              modelConfigPath);
   }
   voice.configRoot = json::parse(modelConfigFile);
   parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
   parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
   parseModelConfig(voice.configRoot, voice.modelConfig);
   if (voice.modelConfig.numSpeakers > 1) {
     // Multi-speaker model
     if (speakerId) {
       voice.synthesisConfig.speakerId = speakerId;
     } else {
       // Default speaker
       voice.synthesisConfig.speakerId = 0;
     }
   }
   loadModel(modelPath, voice.session);
 } /* loadVoice */
 // Phoneme ids to WAV audio
 void synthesize(std::vector<PhonemeId> &phonemeIds,
                SynthesisConfig &synthesisConfig, ModelSession &session,
                std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
  // Allocate
  std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
  std::vector<float> scales{synthesisConfig.noiseScale,
                            synthesisConfig.lengthScale,
                            synthesisConfig.noiseW};
  std::vector<Ort::Value> inputTensors;
  std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
      phonemeIdsShape.size()));
  std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
      phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
  std::vector<int64_t> scalesShape{(int64_t)scales.size()};
  inputTensors.push_back(
      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
                                      scalesShape.data(), scalesShape.size()));
  // Add speaker id.
  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
  std::vector<int64_t> speakerId{
      (int64_t)synthesisConfig.speakerId.value_or(0)};
  std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
  if (synthesisConfig.speakerId) {
    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
        speakerIdShape.size()));
  }
  // From export_onnx.py
  std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
                                            "sid"};
  std::array<const char *, 1> outputNames = {"output"};
  // Infer
  auto startTime = std::chrono::steady_clock::now();
  auto outputTensors = session.onnx.Run(
      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
      inputTensors.size(), outputNames.data(), outputNames.size());
  auto endTime = std::chrono::steady_clock::now();
  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
    throw std::runtime_error("Invalid output tensors");
  }
  auto inferDuration = std::chrono::duration<double>(endTime - startTime);
  result.inferSeconds = inferDuration.count();
  const float *audio = outputTensors.front().GetTensorData<float>();
  auto audioShape =
      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
  int64_t audioCount = audioShape[audioShape.size() - 1];
  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
  result.realTimeFactor = 0.0;
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }
  // Get max audio value for scaling
  float maxAudioValue = 0.01f;
  for (int64_t i = 0; i < audioCount; i++) {
    float audioValue = abs(audio[i]);
    if (audioValue > maxAudioValue) {
      maxAudioValue = audioValue;
    }
  }
  // We know the size up front
  audioBuffer.reserve(audioCount);
  // Scale audio to fill range and convert to int16
  float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
  for (int64_t i = 0; i < audioCount; i++) {
    int16_t intAudioValue = static_cast<int16_t>(
        std::clamp(audio[i] * audioScale,
                   static_cast<float>(std::numeric_limits<int16_t>::min()),
                   static_cast<float>(std::numeric_limits<int16_t>::max())));
    audioBuffer.push_back(intAudioValue);
  }
  // Clean up
  for (std::size_t i = 0; i < outputTensors.size(); i++) {
    Ort::detail::OrtRelease(outputTensors[i].release());
  }
  for (std::size_t i = 0; i < inputTensors.size(); i++) {
    Ort::detail::OrtRelease(inputTensors[i].release());
  }
 }
 // ----------------------------------------------------------------------------
 // Phonemize text and synthesize audio
 //
 // Splits `text` into sentences of phonemes, synthesizes each sentence and
 // appends its samples (plus the configured end-of-sentence silence) to
 // `audioBuffer`. Timing totals are accumulated into `result`.
 //
 // When `audioCallback` is non-empty it is invoked after every sentence and
 // `audioBuffer` is cleared afterwards, so the callback must copy the audio.
 //
 // Throws std::runtime_error when the voice's eSpeak settings are needed but
 // missing, or when a text-phoneme voice names an unsupported language.
 void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                  std::vector<int16_t> &audioBuffer, SynthesisResult &result,
                  const std::function<void()> &audioCallback) {
   (void)config; // not needed here; kept for a uniform API
   std::size_t sentenceSilenceSamples = 0;
   if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
     sentenceSilenceSamples = static_cast<std::size_t>(
         voice.synthesisConfig.sentenceSilenceSeconds *
         voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
   }
   // Phonemes for each sentence
   std::vector<std::vector<Phoneme>> phonemes;
   if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
     // The eSpeak settings are optional in the config; dereferencing an empty
     // optional is undefined behaviour, so fail loudly instead.
     if (!voice.phonemizeConfig.eSpeak) {
       throw std::runtime_error("Missing eSpeak config");
     }
     // Use espeak-ng for phonemization
     eSpeakPhonemeConfig eSpeakConfig;
     eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
     phonemize_eSpeak(text, eSpeakConfig, phonemes);
   } else {
     // Use UTF-8 codepoints as "phonemes"
     CodepointsPhonemeConfig codepointsConfig;
     phonemize_codepoints(text, codepointsConfig, phonemes);
   }
   // Synthesize each sentence independently.
   std::vector<PhonemeId> phonemeIds;
   std::map<Phoneme, std::size_t> missingPhonemes;
   for (auto &sentencePhonemes : phonemes) {
     SynthesisResult sentenceResult;
     PhonemeIdConfig idConfig;
     if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
       if (!voice.phonemizeConfig.eSpeak) {
         throw std::runtime_error("Missing eSpeak config");
       }
       auto &language = voice.phonemizeConfig.eSpeak->voice;
       if (DEFAULT_ALPHABET.count(language) < 1) {
         throw std::runtime_error(
             "Text phoneme language for voice is not supported");
       }
       // Use alphabet for language
       idConfig.phonemeIdMap =
           std::make_shared<PhonemeIdMap>(DEFAULT_ALPHABET[language]);
     }
     // phonemes -> ids
     phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
     // ids -> audio
     synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
                sentenceResult);
     // Add end of sentence silence (a count of zero inserts nothing)
     audioBuffer.insert(audioBuffer.end(), sentenceSilenceSamples, int16_t{0});
     if (audioCallback) {
       // Call back must copy audio since it is cleared afterwards.
       audioCallback();
       audioBuffer.clear();
     }
     result.audioSeconds += sentenceResult.audioSeconds;
     result.inferSeconds += sentenceResult.inferSeconds;
     phonemeIds.clear();
   }
   if (result.audioSeconds > 0) {
     result.realTimeFactor = result.inferSeconds / result.audioSeconds;
   }
 } /* textToAudio */
 // Phonemize text and synthesize audio to WAV file
 //
 // Synthesizes all of `text` into one in-memory buffer (no per-sentence
 // callback), then writes a WAV header followed by the raw 16-bit samples to
 // `audioFile`.
 void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
                    std::ostream &audioFile, SynthesisResult &result) {
   std::vector<int16_t> samples;
   textToAudio(config, voice, text, samples, result, nullptr);
   // Write WAV
   const auto &audioFormat = voice.synthesisConfig;
   const auto sampleCount = static_cast<int32_t>(samples.size());
   writeWavHeader(audioFormat.sampleRate, audioFormat.sampleWidth,
                  audioFormat.channels, sampleCount, audioFile);
   const auto byteCount = sizeof(int16_t) * samples.size();
   audioFile.write(reinterpret_cast<const char *>(samples.data()), byteCount);
 } /* textToWavFile */
 } // namespace piper
--- a/src/cpp/piper.hpp
+++ b/src/cpp/piper.hpp
@ -1,148 +1,105 @@
 #ifndef PIPER_H_
 #define PIPER_H_
-#include <filesystem>
+#include <functional>
-#include <iostream>
+#include <fstream>
 #include <optional>
 #include <string>
 #include <vector>
-#include "json.hpp"
+#include <onnxruntime_cxx_api.h>
-#include <espeak-ng/speak_lib.h>
+#include <phoneme_ids.hpp>
 #include <phonemize.hpp>
-#include "config.hpp"
+#include "json.hpp"
 #include "model.hpp"
 #include "phonemize.hpp"
 #include "synthesize.hpp"
 #include "wavfile.hpp"
 using json = nlohmann::json;
 namespace piper {
-struct Voice {
+typedef int64_t SpeakerId;
-  json configRoot;
+
-  PhonemizeConfig phonemizeConfig;
+struct eSpeakConfig {
-  SynthesisConfig synthesisConfig;
+  std::string voice = "en-us";
  ModelConfig modelConfig;
  ModelSession session;
 };
-void initialize(std::filesystem::path cwd) {
+struct PiperConfig {
-  string dataPath;
+  std::string eSpeakDataPath;
  bool useESpeak = true;
 };
-  auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
+enum PhonemeType { eSpeakPhonemes, TextPhonemes };
  if (std::filesystem::is_directory(cwdDataPath)) {
    dataPath = cwdDataPath.string();
  }
-	cerr << "dataPath: " << dataPath << endl;
+struct PhonemizeConfig {
  PhonemeType phonemeType = eSpeakPhonemes;
  std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
  std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
-  // Set up espeak-ng for calling espeak_TextToPhonemes
+  PhonemeId idPad = 0; // padding (optionally interspersed)
-  int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
+  PhonemeId idBos = 1; // beginning of sentence
-                                 /*buflength*/ 0,
+  PhonemeId idEos = 2; // end of sentence
-                                 /*path*/ dataPath.c_str(),
+  bool interspersePad = true;
                                 /*options*/ 0);
  if (result < 0) {
    throw runtime_error("Failed to initialize eSpeak-ng");
  }
 }
-void terminate() {
+  std::optional<eSpeakConfig> eSpeak;
-  // Clean up espeak-ng
+};
  espeak_Terminate();
 }
-// Load Onnx model and JSON config file
+struct SynthesisConfig {
-void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
+  float noiseScale = 0.667f;
-               optional<SpeakerId> &speakerId) {
+  float lengthScale = 1.0f;
-  ifstream modelConfigFile(modelConfigPath.c_str());
+  float noiseW = 0.8f;
-  voice.configRoot = json::parse(modelConfigFile);
+  int sampleRate = 22050;
  int sampleWidth = 2; // 16-bit
  int channels = 1;    // mono
  std::optional<SpeakerId> speakerId;
  float sentenceSilenceSeconds = 0.2f;
 };
-  parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
+struct ModelConfig {
-  parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
+  int numSpeakers;
-  parseModelConfig(voice.configRoot, voice.modelConfig);
+};
-  if (voice.modelConfig.numSpeakers > 1) {
+struct ModelSession {
-    // Multispeaker model
+  Ort::Session onnx;
-    if (speakerId) {
+  Ort::AllocatorWithDefaultOptions allocator;
-      voice.synthesisConfig.speakerId = speakerId;
+  Ort::SessionOptions options;
-    } else {
+  Ort::Env env;
      // Default speaker
      voice.synthesisConfig.speakerId = 0;
    }
  }
-  loadModel(modelPath, voice.session);
+  ModelSession() : onnx(nullptr){};
 };
-} /* loadVoice */
+struct SynthesisResult {
  double inferSeconds;
  double audioSeconds;
  double realTimeFactor;
 };
-// Phonemize text and synthesize audio
+struct Voice {
-void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
+  json configRoot;
-                 SynthesisResult &result,
+  PhonemizeConfig phonemizeConfig;
-                 const function<void()> &audioCallback) {
+  SynthesisConfig synthesisConfig;
-
+  ModelConfig modelConfig;
-  size_t sentenceSilenceSamples = 0;
+  ModelSession session;
-  if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
+};
    sentenceSilenceSamples = (size_t)(
        voice.synthesisConfig.sentenceSilenceSeconds *
        voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
  }
  // Phonemes for each sentence
  vector<vector<Phoneme>> phonemes;
  phonemize(text, voice.phonemizeConfig, phonemes);
  vector<PhonemeId> phonemeIds;
  for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
       ++phonemesIter) {
    vector<Phoneme> &sentencePhonemes = *phonemesIter;
    SynthesisResult sentenceResult;
    phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
               sentenceResult);
    // Add end of sentence silence
    if (sentenceSilenceSamples > 0) {
      for (size_t i = 0; i < sentenceSilenceSamples; i++) {
        audioBuffer.push_back(0);
      }
    }
    if (audioCallback) {
      // Call back must copy audio since it is cleared afterwards.
      audioCallback();
      audioBuffer.clear();
    }
    result.audioSeconds += sentenceResult.audioSeconds;
    result.inferSeconds += sentenceResult.inferSeconds;
    phonemeIds.clear();
  }
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }
 } /* textToAudio */
-// Phonemize text and synthesize audio to WAV file
+// Must be called before using textTo* functions
-void textToWavFile(Voice &voice, string text, ostream &audioFile,
+void initialize(PiperConfig &config);
                   SynthesisResult &result) {
-  vector<int16_t> audioBuffer;
+// Clean up
-  textToAudio(voice, text, audioBuffer, result, NULL);
+void terminate(PiperConfig &config);
-  // Write WAV
+// Load Onnx model and JSON config file
-  auto synthesisConfig = voice.synthesisConfig;
+void loadVoice(PiperConfig &config, std::string modelPath,
-  writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
+               std::string modelConfigPath, Voice &voice,
-                 synthesisConfig.channels, (int32_t)audioBuffer.size(),
+               std::optional<SpeakerId> &speakerId);
                 audioFile);
-  audioFile.write((const char *)audioBuffer.data(),
+// Phonemize text and synthesize audio
-                  sizeof(int16_t) * audioBuffer.size());
+void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
                 const std::function<void()> &audioCallback);
-} /* textToWavFile */
+// Phonemize text and synthesize audio to WAV file
 void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
                   std::ostream &audioFile, SynthesisResult &result);
 } // namespace piper
--- a/src/cpp/synthesize.hpp
+++ b/src/cpp/synthesize.hpp
@ -1,130 +0,0 @@
 #ifndef SYNTHESIZE_H_
 #define SYNTHESIZE_H_
 #include <algorithm>
 #include <array>
 #include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <stdexcept>
 #include <vector>
 #include <onnxruntime_cxx_api.h>
 #include "config.hpp"
 #include "model.hpp"
 using namespace std;
 namespace piper {
 // Maximum value for 16-bit signed WAV sample.
 // synthesize() divides this by the peak absolute sample (floored at 0.01) to
 // get the scale factor applied before converting float audio to int16.
 const float MAX_WAV_VALUE = 32767.0f;
 // Timing statistics produced by synthesis.
 //
 // Every field defaults to zero so that callers which accumulate into a
 // result with += start from a defined value.
 struct SynthesisResult {
  // Wall-clock seconds spent running model inference.
  double inferSeconds = 0.0;
  // Duration of the generated audio in seconds (samples / sample rate).
  double audioSeconds = 0.0;
  // inferSeconds / audioSeconds; left at 0 when no audio was produced.
  double realTimeFactor = 0.0;
 };
 // Phoneme ids to WAV audio
 void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
                ModelSession &session, vector<int16_t> &audioBuffer,
                SynthesisResult &result) {
  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
  // Allocate
  vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
  vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
                       synthesisConfig.noiseW};
  vector<Ort::Value> inputTensors;
  vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
      phonemeIdsShape.size()));
  vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
      phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
  vector<int64_t> scalesShape{(int64_t)scales.size()};
  inputTensors.push_back(
      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
                                      scalesShape.data(), scalesShape.size()));
  // Add speaker id.
  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
  vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
  vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
  if (synthesisConfig.speakerId) {
    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
        speakerIdShape.size()));
  }
  // From export_onnx.py
  array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
                                       "sid"};
  array<const char *, 1> outputNames = {"output"};
  // Infer
  auto startTime = chrono::steady_clock::now();
  auto outputTensors = session.onnx.Run(
      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
      inputTensors.size(), outputNames.data(), outputNames.size());
  auto endTime = chrono::steady_clock::now();
  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
    throw runtime_error("Invalid output tensors");
  }
  auto inferDuration = chrono::duration<double>(endTime - startTime);
  result.inferSeconds = inferDuration.count();
  const float *audio = outputTensors.front().GetTensorData<float>();
  auto audioShape =
      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
  int64_t audioCount = audioShape[audioShape.size() - 1];
  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
  result.realTimeFactor = 0.0;
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }
  // Get max audio value for scaling
  float maxAudioValue = 0.01f;
  for (int64_t i = 0; i < audioCount; i++) {
    float audioValue = abs(audio[i]);
    if (audioValue > maxAudioValue) {
      maxAudioValue = audioValue;
    }
  }
  // We know the size up front
  audioBuffer.reserve(audioCount);
  // Scale audio to fill range and convert to int16
  float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
  for (int64_t i = 0; i < audioCount; i++) {
    int16_t intAudioValue = static_cast<int16_t>(
        clamp(audio[i] * audioScale,
              static_cast<float>(numeric_limits<int16_t>::min()),
              static_cast<float>(numeric_limits<int16_t>::max())));
    audioBuffer.push_back(intAudioValue);
  }
  // Clean up
  for (size_t i = 0; i < outputTensors.size(); i++) {
    Ort::detail::OrtRelease(outputTensors[i].release());
  }
  for (size_t i = 0; i < inputTensors.size(); i++) {
    Ort::detail::OrtRelease(inputTensors[i].release());
  }
 }
 } // namespace piper
 #endif // SYNTHESIZE_H_
--- a/src/cpp/wavfile.hpp
+++ b/src/cpp/wavfile.hpp
@ -3,8 +3,6 @@
 #include <iostream>
 namespace piper {
 struct WavHeader {
  uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
  uint32_t chunkSize;
@ -14,7 +12,7 @@ struct WavHeader {
  uint8_t fmt[4] = {'f', 'm', 't', ' '};
  uint32_t fmtSize = 16;    // bytes
  uint16_t audioFormat = 1; // PCM
-  uint16_t numChannels; // mono
+  uint16_t numChannels;     // mono
  uint32_t sampleRate;      // Hertz
  uint32_t bytesPerSec;     // sampleRate * sampleWidth
  uint16_t blockAlign = 2;  // 16-bit mono
@ -39,6 +37,4 @@ void writeWavHeader(int sampleRate, int sampleWidth, int channels,
 } /* writeWavHeader */
 } // namespace piper
 #endif // WAVFILE_H_