First working version with libpiper_phonemize

pull/100/head
Michael Hansen 12 months ago
parent 7d27863b48
commit 810fad44cf

@@ -1,16 +1,12 @@
.PHONY: release debug clean test
.PHONY: piper clean test
release:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
LIB_DIR := lib/Linux-$(shell uname -m)
no-pcaudio:
piper:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release -DUSE_PCAUDIO=OFF && make
debug:
mkdir -p build
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Debug && make
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
cp -aR $(LIB_DIR)/piper_phonemize/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/
cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/
clean:
rm -rf build/ dist/

Binary file not shown.

@@ -4,47 +4,31 @@ include(CheckIncludeFileCXX)
project(piper C CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
ADD_EXECUTABLE(piper main.cpp)
ADD_EXECUTABLE(piper main.cpp piper.cpp)
string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
find_package(PkgConfig)
pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
# https://github.com/espeak-ng/pcaudiolib
check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
if(PCAUDIO_INCLUDE_FOUND)
option(USE_PCAUDIO "Build with pcaudiolib" ON)
if(USE_PCAUDIO)
target_compile_definitions(piper PUBLIC HAVE_PCAUDIO)
set(PCAUDIO_LIBRARIES "pcaudio")
endif()
endif()
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR})
set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime)
target_link_libraries(piper
piper_phonemize
espeak-ng
onnxruntime
pthread
${ESPEAK_NG_LIBRARIES}
${PCAUDIO_LIBRARIES})
pthread)
if(NOT APPLE)
target_link_libraries(piper -static-libgcc -static-libstdc++)
endif()
target_link_directories(piper PUBLIC
${ESPEAK_NG_LIBRARY_DIRS}
${PIPER_PHONEMIZE_ROOTDIR}/lib
${ONNXRUNTIME_ROOTDIR}/lib)
target_include_directories(piper PUBLIC
${ONNXRUNTIME_ROOTDIR}/include
${ESPEAK_NG_INCLUDE_DIRS})
target_compile_options(piper PUBLIC
${ESPEAK_NG_CFLAGS_OTHER})
${PIPER_PHONEMIZE_ROOTDIR}/include
${ONNXRUNTIME_ROOTDIR}/include)

@@ -1,155 +0,0 @@
#ifndef CONFIG_H_
#define CONFIG_H_
#include <filesystem>
#include <map>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
#include "json.hpp"
#include "utf8.h"
using namespace std;
using json = nlohmann::json;
namespace piper {
typedef char32_t Phoneme;
typedef int64_t PhonemeId;
typedef int64_t SpeakerId;
const string DefaultVoice = "en-us";
enum eSpeakMode { Text, TextWithPhonemes, SSML };
struct eSpeakConfig {
string voice = DefaultVoice;
eSpeakMode mode = Text;
// Characters that eSpeak uses to break apart paragraphs/sentences
set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
Phoneme fullStop = U'.';
Phoneme comma = U',';
Phoneme question = U'?';
Phoneme exclamation = U'!';
};
struct PhonemizeConfig {
optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
map<Phoneme, vector<PhonemeId>> phonemeIdMap;
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
bool interspersePad = true;
optional<eSpeakConfig> eSpeak;
};
struct SynthesisConfig {
float noiseScale = 0.667f;
float lengthScale = 1.0f;
float noiseW = 0.8f;
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
optional<SpeakerId> speakerId;
float sentenceSilenceSeconds = 0.2f;
};
struct ModelConfig {
int numSpeakers;
};
bool isSingleCodepoint(string s) {
return utf8::distance(s.begin(), s.end()) == 1;
}
Phoneme getCodepoint(string s) {
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
return *character_iter;
}
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
if (configRoot.contains("espeak")) {
if (!phonemizeConfig.eSpeak) {
phonemizeConfig.eSpeak.emplace();
}
auto espeakValue = configRoot["espeak"];
if (espeakValue.contains("voice")) {
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<string>();
}
}
// phoneme to [phoneme] map
if (configRoot.contains("phoneme_map")) {
if (!phonemizeConfig.phonemeMap) {
phonemizeConfig.phonemeMap.emplace();
}
auto phonemeMapValue = configRoot["phoneme_map"];
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
string toPhoneme = toPhonemeValue.get<string>();
if (!isSingleCodepoint(toPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
}
auto toCodepoint = getCodepoint(toPhoneme);
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
}
}
}
// phoneme to [id] map
if (configRoot.contains("phoneme_id_map")) {
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toIdValue : fromPhonemeItem.value()) {
PhonemeId toId = toIdValue.get<PhonemeId>();
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
}
}
}
} /* parsePhonemizeConfig */
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
if (configRoot.contains("audio")) {
auto audioValue = configRoot["audio"];
if (audioValue.contains("sample_rate")) {
// Default sample rate is 22050 Hz
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
}
}
} /* parseSynthesisConfig */
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
} /* parseModelConfig */
} // namespace piper
#endif // CONFIG_H_

@@ -2,6 +2,7 @@
#include <condition_variable>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <mutex>
#include <sstream>
@@ -10,11 +11,6 @@
#include <thread>
#include <vector>
#ifdef HAVE_PCAUDIO
// https://github.com/espeak-ng/pcaudiolib
#include <pcaudiolib/audio.h>
#endif
#ifdef _MSC_VER
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
@@ -29,19 +25,13 @@
using namespace std;
enum OutputType {
OUTPUT_FILE,
OUTPUT_DIRECTORY,
OUTPUT_STDOUT,
OUTPUT_PLAY,
OUTPUT_RAW
};
enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
struct RunConfig {
filesystem::path modelPath;
filesystem::path modelConfigPath;
OutputType outputType = OUTPUT_PLAY;
optional<filesystem::path> outputPath;
OutputType outputType = OUTPUT_DIRECTORY;
optional<filesystem::path> outputPath = filesystem::path(".");
optional<piper::SpeakerId> speakerId;
optional<float> noiseScale;
optional<float> lengthScale;
@@ -53,12 +43,6 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
#ifdef HAVE_PCAUDIO
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
#endif
int main(int argc, char *argv[]) {
RunConfig runConfig;
parseArgs(argc, argv, runConfig);
@@ -66,7 +50,7 @@ int main(int argc, char *argv[]) {
// NOTE: This won't work for Windows (need GetModuleFileName)
#ifdef _MSC_VER
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = { 0 };
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
@@ -81,17 +65,22 @@ int main(int argc, char *argv[]) {
#else
auto exePath = filesystem::canonical("/proc/self/exe");
#endif
#endif
piper::initialize(exePath.parent_path());
piper::PiperConfig piperConfig;
piperConfig.eSpeakDataPath =
std::filesystem::absolute(exePath.parent_path().append("espeak-ng-data"))
.string();
piper::Voice voice;
auto startTime = chrono::steady_clock::now();
loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
voice, runConfig.speakerId);
loadVoice(piperConfig, runConfig.modelPath.string(),
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
auto endTime = chrono::steady_clock::now();
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
cerr << "Load time: " << loadSeconds << " sec" << endl;
piper::initialize(piperConfig);
// Scales
if (runConfig.noiseScale) {
voice.synthesisConfig.noiseScale = runConfig.noiseScale.value();
@@ -105,33 +94,6 @@ int main(int argc, char *argv[]) {
voice.synthesisConfig.noiseW = runConfig.noiseW.value();
}
#ifdef HAVE_PCAUDIO
audio_object *my_audio = nullptr;
if (runConfig.outputType == OUTPUT_PLAY) {
// Output audio to the default audio device
my_audio = create_audio_device_object(NULL, "piper", "Text-to-Speech");
// TODO: Support 32-bit sample widths
auto audioFormat = AUDIO_OBJECT_FORMAT_S16LE;
int error = audio_object_open(my_audio, audioFormat,
voice.synthesisConfig.sampleRate,
voice.synthesisConfig.channels);
if (error != 0) {
throw runtime_error(audio_object_strerror(my_audio, error));
}
}
#else
if (runConfig.outputType == OUTPUT_PLAY) {
// Cannot play audio directly
cerr << "WARNING: Piper was not compiled with pcaudiolib. Output audio "
"will be written to the current directory."
<< endl;
runConfig.outputType = OUTPUT_DIRECTORY;
runConfig.outputPath = filesystem::path(".");
}
#endif
if (runConfig.outputType == OUTPUT_DIRECTORY) {
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
cerr << "Output directory: " << runConfig.outputPath.value() << endl;
@@ -155,7 +117,7 @@ int main(int argc, char *argv[]) {
// Output audio to automatically-named WAV file in a directory
ofstream audioFile(outputPath.string(), ios::binary);
piper::textToWavFile(voice, line, audioFile, result);
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
cout << outputPath.string() << endl;
} else if (runConfig.outputType == OUTPUT_FILE) {
// Read all of standard input before synthesizing.
@@ -168,10 +130,10 @@ int main(int argc, char *argv[]) {
// Output audio to WAV file
ofstream audioFile(runConfig.outputPath.value().string(), ios::binary);
piper::textToWavFile(voice, text.str(), audioFile, result);
piper::textToWavFile(piperConfig, voice, text.str(), audioFile, result);
} else if (runConfig.outputType == OUTPUT_STDOUT) {
// Output WAV to stdout
piper::textToWavFile(voice, line, cout, result);
piper::textToWavFile(piperConfig, voice, line, cout, result);
} else if (runConfig.outputType == OUTPUT_RAW) {
// Raw output to stdout
mutex mutAudio;
@@ -195,7 +157,8 @@ int main(int argc, char *argv[]) {
cvAudio.notify_one();
}
};
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
audioCallback);
// Signal thread that there is no more audio
{
@@ -208,45 +171,6 @@ int main(int argc, char *argv[]) {
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
rawOutputThread.join();
} else if (runConfig.outputType == OUTPUT_PLAY) {
#ifdef HAVE_PCAUDIO
mutex mutAudio;
condition_variable cvAudio;
bool audioReady = false;
bool audioFinished = false;
vector<int16_t> audioBuffer;
vector<int16_t> sharedAudioBuffer;
thread playThread(playProc, my_audio, ref(sharedAudioBuffer),
ref(mutAudio), ref(cvAudio), ref(audioReady),
ref(audioFinished));
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
&cvAudio, &audioReady]() {
// Signal thread that audio is ready
{
unique_lock lockAudio(mutAudio);
copy(audioBuffer.begin(), audioBuffer.end(),
back_inserter(sharedAudioBuffer));
audioReady = true;
cvAudio.notify_one();
}
};
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
// Signal thread that there is no more audio
{
unique_lock lockAudio(mutAudio);
audioReady = true;
audioFinished = true;
cvAudio.notify_one();
}
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
playThread.join();
#else
throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
#endif
}
cerr << "Real-time factor: " << result.realTimeFactor
@@ -254,13 +178,7 @@ int main(int argc, char *argv[]) {
<< " sec, audio=" << result.audioSeconds << " sec)" << endl;
}
piper::terminate();
#ifdef HAVE_PCAUDIO
audio_object_close(my_audio);
audio_object_destroy(my_audio);
my_audio = nullptr;
#endif
piper::terminate(piperConfig);
return EXIT_SUCCESS;
}
@@ -296,43 +214,6 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
} // rawOutputProc
#ifdef HAVE_PCAUDIO
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
bool &audioFinished) {
vector<int16_t> internalAudioBuffer;
while (true) {
{
unique_lock lockAudio{mutAudio};
cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });
if (sharedAudioBuffer.empty() && audioFinished) {
break;
}
copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(),
back_inserter(internalAudioBuffer));
sharedAudioBuffer.clear();
if (!audioFinished) {
audioReady = false;
}
}
int error =
audio_object_write(my_audio, (const char *)internalAudioBuffer.data(),
sizeof(int16_t) * internalAudioBuffer.size());
if (error != 0) {
throw runtime_error(audio_object_strerror(my_audio, error));
}
audio_object_flush(my_audio);
internalAudioBuffer.clear();
}
} // playProc
#endif
void printUsage(char *argv[]) {
cerr << endl;
cerr << "usage: " << argv[0] << " [options]" << endl;

@@ -1,53 +0,0 @@
#ifndef MODEL_H_
#define MODEL_H_
#include <string>
#include <onnxruntime_cxx_api.h>
using namespace std;
namespace piper {
const string instanceName{"piper"};
struct ModelSession {
Ort::Session onnx;
Ort::AllocatorWithDefaultOptions allocator;
Ort::SessionOptions options;
Ort::Env env;
ModelSession() : onnx(nullptr){};
};
void loadModel(string modelPath, ModelSession &session) {
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
instanceName.c_str());
session.env.DisableTelemetryEvents();
// Slows down performance by ~2x
// session.options.SetIntraOpNumThreads(1);
// Roughly doubles load time for no visible inference benefit
// session.options.SetGraphOptimizationLevel(
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
session.options.SetGraphOptimizationLevel(
GraphOptimizationLevel::ORT_DISABLE_ALL);
// Slows down performance very slightly
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
session.options.DisableCpuMemArena();
session.options.DisableMemPattern();
session.options.DisableProfiling();
auto startTime = chrono::steady_clock::now();
session.onnx = Ort::Session(session.env, filesystem::path(modelPath).c_str(), session.options);
auto endTime = chrono::steady_clock::now();
auto loadDuration = chrono::duration<double>(endTime - startTime);
}
} // namespace piper
#endif // MODEL_H_

@@ -1,142 +0,0 @@
#ifndef PHONEMIZE_H_
#define PHONEMIZE_H_
#include <filesystem>
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
#include <espeak-ng/speak_lib.h>
#include "config.hpp"
#include "utf8.h"
#define CLAUSE_INTONATION_FULL_STOP 0x00000000
#define CLAUSE_INTONATION_COMMA 0x00001000
#define CLAUSE_INTONATION_QUESTION 0x00002000
#define CLAUSE_INTONATION_EXCLAMATION 0x00003000
#define CLAUSE_TYPE_SENTENCE 0x00080000
using namespace std;
namespace piper {
// Text to phonemes using eSpeak-ng
void phonemize(string text, PhonemizeConfig &phonemizeConfig,
vector<vector<Phoneme>> &phonemes) {
if (!phonemizeConfig.eSpeak) {
throw runtime_error("Missing eSpeak config");
}
auto voice = phonemizeConfig.eSpeak->voice;
int result = espeak_SetVoiceByName(voice.c_str());
if (result != 0) {
throw runtime_error("Failed to set eSpeak-ng voice");
}
// Modified by eSpeak
string textCopy(text);
utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
vector<char32_t> textClauseBreakers;
// Identify clause breakers in the sentence, since eSpeak removes them during
// phonemization.
//
// This will unfortunately do the wrong thing with abbreviations, etc.
while (textIter != textIterEnd) {
auto codepoint = *textIter;
if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
textClauseBreakers.push_back(codepoint);
}
textIter++;
}
vector<Phoneme> *sentencePhonemes = nullptr;
const char *inputTextPointer = textCopy.c_str();
int terminator = 0;
while (inputTextPointer != NULL) {
// Modified espeak-ng API to get access to clause terminator
string clausePhonemes(
espeak_TextToPhonemes2((const void **)&inputTextPointer,
/*textmode*/ espeakCHARS_AUTO,
/*phonememode = IPA*/ 0x02,
&terminator));
utf8::iterator phonemeIter(clausePhonemes.begin(), clausePhonemes.begin(),
clausePhonemes.end());
utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
clausePhonemes.end());
if (!sentencePhonemes) {
// Start new sentence
phonemes.emplace_back();
sentencePhonemes = &phonemes[phonemes.size() - 1];
}
sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
// Add appropriate punctuation depending on terminator type
int intonation = terminator & 0x0000F000;
if (intonation == CLAUSE_INTONATION_FULL_STOP) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->fullStop);
} else if (intonation == CLAUSE_INTONATION_COMMA) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->comma);
} else if (intonation == CLAUSE_INTONATION_QUESTION) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->question);
} else if (intonation == CLAUSE_INTONATION_EXCLAMATION) {
sentencePhonemes->push_back(phonemizeConfig.eSpeak->exclamation);
}
if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) {
// End of sentence
sentencePhonemes = nullptr;
}
} // while inputTextPointer != NULL
} /* phonemize */
// Phonemes to ids using JSON map
void phonemes2ids(vector<Phoneme> &phonemes, PhonemizeConfig &phonemizeConfig,
vector<PhonemeId> &phonemeIds) {
if (phonemes.empty()) {
throw runtime_error("No phonemes");
}
phonemeIds.push_back(phonemizeConfig.idBos);
if (phonemizeConfig.interspersePad) {
phonemeIds.push_back(phonemizeConfig.idPad);
}
for (auto phoneme = phonemes.begin(); phoneme != phonemes.end(); phoneme++) {
if (phonemizeConfig.phonemeIdMap.contains(*phoneme)) {
for (auto id : phonemizeConfig.phonemeIdMap[*phoneme]) {
phonemeIds.push_back(id);
if (phonemizeConfig.interspersePad) {
phonemeIds.push_back(phonemizeConfig.idPad);
}
}
} else {
string phonemeStr;
utf8::append(*phoneme, phonemeStr);
cerr << "[WARN] No id for phoneme: " << phonemeStr << endl;
}
}
phonemeIds.push_back(phonemizeConfig.idEos);
} /* phonemes2ids */
} // namespace piper
#endif // PHONEMIZE_H_

@@ -0,0 +1,393 @@
#include <array>
#include <chrono>
#include <fstream>
#include <limits>
#include <stdexcept>
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include "piper.hpp"
#include "utf8.h"
#include "wavfile.hpp"
namespace piper {
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
const std::string instanceName{"piper"};
bool isSingleCodepoint(std::string s) {
return utf8::distance(s.begin(), s.end()) == 1;
}
Phoneme getCodepoint(std::string s) {
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
return *character_iter;
}
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
if (configRoot.contains("espeak")) {
if (!phonemizeConfig.eSpeak) {
phonemizeConfig.eSpeak.emplace();
}
auto espeakValue = configRoot["espeak"];
if (espeakValue.contains("voice")) {
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
}
}
if (configRoot.contains("phoneme_type")) {
auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
if (phonemeTypeStr == "text") {
phonemizeConfig.phonemeType = TextPhonemes;
}
}
// phoneme to [phoneme] map
if (configRoot.contains("phoneme_map")) {
if (!phonemizeConfig.phonemeMap) {
phonemizeConfig.phonemeMap.emplace();
}
auto phonemeMapValue = configRoot["phoneme_map"];
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
std::string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw std::runtime_error(
"Phonemes must be one codepoint (phoneme map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
std::string toPhoneme = toPhonemeValue.get<std::string>();
if (!isSingleCodepoint(toPhoneme)) {
throw std::runtime_error(
"Phonemes must be one codepoint (phoneme map)");
}
auto toCodepoint = getCodepoint(toPhoneme);
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
}
}
}
// phoneme to [id] map
if (configRoot.contains("phoneme_id_map")) {
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
std::string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw std::runtime_error(
"Phonemes must be one codepoint (phoneme id map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto &toIdValue : fromPhonemeItem.value()) {
PhonemeId toId = toIdValue.get<PhonemeId>();
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
}
}
}
} /* parsePhonemizeConfig */
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
if (configRoot.contains("audio")) {
auto audioValue = configRoot["audio"];
if (audioValue.contains("sample_rate")) {
// Default sample rate is 22050 Hz
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
}
}
} /* parseSynthesisConfig */
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
} /* parseModelConfig */
void initialize(PiperConfig &config) {
if (config.useESpeak) {
// Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
// See: https://github.com/rhasspy/espeak-ng
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ config.eSpeakDataPath.c_str(),
/*options*/ 0);
if (result < 0) {
throw std::runtime_error("Failed to initialize eSpeak-ng");
}
}
}
void terminate(PiperConfig &config) {
if (config.useESpeak) {
// Clean up espeak-ng
espeak_Terminate();
}
}
void loadModel(std::string modelPath, ModelSession &session) {
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
instanceName.c_str());
session.env.DisableTelemetryEvents();
// Slows down performance by ~2x
// session.options.SetIntraOpNumThreads(1);
// Roughly doubles load time for no visible inference benefit
// session.options.SetGraphOptimizationLevel(
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
session.options.SetGraphOptimizationLevel(
GraphOptimizationLevel::ORT_DISABLE_ALL);
// Slows down performance very slightly
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
session.options.DisableCpuMemArena();
session.options.DisableMemPattern();
session.options.DisableProfiling();
auto startTime = std::chrono::steady_clock::now();
session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
auto endTime = std::chrono::steady_clock::now();
auto loadDuration = std::chrono::duration<double>(endTime - startTime);
}
// Load Onnx model and JSON config file
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId) {
std::ifstream modelConfigFile(modelConfigPath);
voice.configRoot = json::parse(modelConfigFile);
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
if (voice.modelConfig.numSpeakers > 1) {
// Multi-speaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
loadModel(modelPath, voice.session);
} /* loadVoice */
// Phoneme ids to WAV audio
void synthesize(std::vector<PhonemeId> &phonemeIds,
SynthesisConfig &synthesisConfig, ModelSession &session,
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
std::vector<float> scales{synthesisConfig.noiseScale,
synthesisConfig.lengthScale,
synthesisConfig.noiseW};
std::vector<Ort::Value> inputTensors;
std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
std::vector<int64_t> scalesShape{(int64_t)scales.size()};
inputTensors.push_back(
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
// Add speaker id.
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
std::vector<int64_t> speakerId{
(int64_t)synthesisConfig.speakerId.value_or(0)};
std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
if (synthesisConfig.speakerId) {
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// From export_onnx.py
std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
"sid"};
std::array<const char *, 1> outputNames = {"output"};
// Infer
auto startTime = std::chrono::steady_clock::now();
auto outputTensors = session.onnx.Run(
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
inputTensors.size(), outputNames.data(), outputNames.size());
auto endTime = std::chrono::steady_clock::now();
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
throw std::runtime_error("Invalid output tensors");
}
auto inferDuration = std::chrono::duration<double>(endTime - startTime);
result.inferSeconds = inferDuration.count();
const float *audio = outputTensors.front().GetTensorData<float>();
auto audioShape =
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
int64_t audioCount = audioShape[audioShape.size() - 1];
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
result.realTimeFactor = 0.0;
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
// Get max audio value for scaling
float maxAudioValue = 0.01f;
for (int64_t i = 0; i < audioCount; i++) {
float audioValue = abs(audio[i]);
if (audioValue > maxAudioValue) {
maxAudioValue = audioValue;
}
}
// We know the size up front
audioBuffer.reserve(audioCount);
// Scale audio to fill range and convert to int16
float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
for (int64_t i = 0; i < audioCount; i++) {
int16_t intAudioValue = static_cast<int16_t>(
std::clamp(audio[i] * audioScale,
static_cast<float>(std::numeric_limits<int16_t>::min()),
static_cast<float>(std::numeric_limits<int16_t>::max())));
audioBuffer.push_back(intAudioValue);
}
// Clean up
for (std::size_t i = 0; i < outputTensors.size(); i++) {
Ort::detail::OrtRelease(outputTensors[i].release());
}
for (std::size_t i = 0; i < inputTensors.size(); i++) {
Ort::detail::OrtRelease(inputTensors[i].release());
}
}
// ----------------------------------------------------------------------------
// Phonemize text and synthesize audio
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback) {
std::size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (std::size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
// Phonemes for each sentence
std::vector<std::vector<Phoneme>> phonemes;
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
// Use espeak-ng for phonemization
eSpeakPhonemeConfig eSpeakConfig;
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
phonemize_eSpeak(text, eSpeakConfig, phonemes);
} else {
// Use UTF-8 codepoints as "phonemes"
CodepointsPhonemeConfig codepointsConfig;
phonemize_codepoints(text, codepointsConfig, phonemes);
}
// Synthesize each sentence independently.
std::vector<PhonemeId> phonemeIds;
std::map<Phoneme, std::size_t> missingPhonemes;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
SynthesisResult sentenceResult;
PhonemeIdConfig idConfig;
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
auto &language = voice.phonemizeConfig.eSpeak->voice;
if (DEFAULT_ALPHABET.count(language) < 1) {
throw std::runtime_error(
"Text phoneme language for voice is not supported");
}
// Use alphabet for language
idConfig.phonemeIdMap =
std::make_shared<PhonemeIdMap>(DEFAULT_ALPHABET[language]);
}
// phonemes -> ids
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
// ids -> audio
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Callback must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
// Phonemize text and synthesize audio to WAV file
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result) {
std::vector<int16_t> audioBuffer;
textToAudio(config, voice, text, audioBuffer, result, NULL);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
synthesisConfig.channels, (int32_t)audioBuffer.size(),
audioFile);
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
} /* textToWavFile */
} // namespace piper
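
Usage sketch (not part of the commit): the functions defined above form the new public flow introduced here — set eSpeakDataPath, loadVoice(), initialize(), then one of the textTo* calls, and finally terminate() — mirroring the updated main.cpp. A minimal stand-alone example; file paths are placeholders:

#include <fstream>
#include <optional>

#include "piper.hpp"

int main() {
  piper::PiperConfig config;
  // Assumption: espeak-ng-data sits next to the binary, as the Makefile's
  // copy step arranges.
  config.eSpeakDataPath = "./espeak-ng-data";

  piper::Voice voice;
  std::optional<piper::SpeakerId> speakerId; // empty -> default speaker
  piper::loadVoice(config, "voice.onnx", "voice.onnx.json", voice, speakerId);
  piper::initialize(config); // must be called before the textTo* functions

  piper::SynthesisResult result{};
  std::ofstream wavFile("output.wav", std::ios::binary);
  piper::textToWavFile(config, voice, "Hello from piper!", wavFile, result);

  piper::terminate(config);
  return 0;
}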

@@ -1,148 +1,105 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <filesystem>
#include <iostream>
#include <functional>
#include <fstream>
#include <optional>
#include <string>
#include <vector>
#include "json.hpp"
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include <phoneme_ids.hpp>
#include <phonemize.hpp>
#include "config.hpp"
#include "model.hpp"
#include "phonemize.hpp"
#include "synthesize.hpp"
#include "wavfile.hpp"
#include "json.hpp"
using json = nlohmann::json;
namespace piper {
struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
ModelSession session;
typedef int64_t SpeakerId;
struct eSpeakConfig {
std::string voice = "en-us";
};
void initialize(std::filesystem::path cwd) {
string dataPath;
struct PiperConfig {
std::string eSpeakDataPath;
bool useESpeak = true;
};
auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
if (std::filesystem::is_directory(cwdDataPath)) {
dataPath = cwdDataPath.string();
}
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
cerr << "dataPath: " << dataPath << endl;
struct PhonemizeConfig {
PhonemeType phonemeType = eSpeakPhonemes;
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
// Set up espeak-ng for calling espeak_TextToPhonemes
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ dataPath.c_str(),
/*options*/ 0);
if (result < 0) {
throw runtime_error("Failed to initialize eSpeak-ng");
}
}
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
bool interspersePad = true;
void terminate() {
// Clean up espeak-ng
espeak_Terminate();
}
std::optional<eSpeakConfig> eSpeak;
};
// Load Onnx model and JSON config file
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
optional<SpeakerId> &speakerId) {
ifstream modelConfigFile(modelConfigPath.c_str());
voice.configRoot = json::parse(modelConfigFile);
struct SynthesisConfig {
float noiseScale = 0.667f;
float lengthScale = 1.0f;
float noiseW = 0.8f;
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
std::optional<SpeakerId> speakerId;
float sentenceSilenceSeconds = 0.2f;
};
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
struct ModelConfig {
int numSpeakers;
};
if (voice.modelConfig.numSpeakers > 1) {
// Multispeaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
struct ModelSession {
Ort::Session onnx;
Ort::AllocatorWithDefaultOptions allocator;
Ort::SessionOptions options;
Ort::Env env;
loadModel(modelPath, voice.session);
ModelSession() : onnx(nullptr){};
};
} /* loadVoice */
struct SynthesisResult {
double inferSeconds;
double audioSeconds;
double realTimeFactor;
};
// Phonemize text and synthesize audio
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
SynthesisResult &result,
const function<void()> &audioCallback) {
size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
// Phonemes for each sentence
vector<vector<Phoneme>> phonemes;
phonemize(text, voice.phonemizeConfig, phonemes);
vector<PhonemeId> phonemeIds;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
vector<Phoneme> &sentencePhonemes = *phonemesIter;
SynthesisResult sentenceResult;
phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Callback must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
ModelSession session;
};
// Phonemize text and synthesize audio to WAV file
void textToWavFile(Voice &voice, string text, ostream &audioFile,
SynthesisResult &result) {
// Must be called before using textTo* functions
void initialize(PiperConfig &config);
vector<int16_t> audioBuffer;
textToAudio(voice, text, audioBuffer, result, NULL);
// Clean up
void terminate(PiperConfig &config);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
synthesisConfig.channels, (int32_t)audioBuffer.size(),
audioFile);
// Load Onnx model and JSON config file
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId);
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
// Phonemize text and synthesize audio
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback);
} /* textToWavFile */
// Phonemize text and synthesize audio to WAV file
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result);
} // namespace piper
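
Streaming sketch (not part of the commit): textToAudio() hands audio back one sentence at a time through the callback declared above, and the callback has to copy or write out audioBuffer because it is cleared after each sentence. A fragment, assuming config and voice have been prepared as in the previous sketch:

  // Requires <iostream> and <vector>; writes raw 16-bit PCM to stdout,
  // like main.cpp's OUTPUT_RAW path.
  std::vector<int16_t> audioBuffer;
  piper::SynthesisResult result{};
  auto audioCallback = [&audioBuffer]() {
    std::cout.write(reinterpret_cast<const char *>(audioBuffer.data()),
                    sizeof(int16_t) * audioBuffer.size());
  };
  piper::textToAudio(config, voice, "Hello from piper!", audioBuffer, result,
                     audioCallback);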

@@ -1,130 +0,0 @@
#ifndef SYNTHESIZE_H_
#define SYNTHESIZE_H_
#include <array>
#include <chrono>
#include <limits>
#include <memory>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include "config.hpp"
#include "model.hpp"
using namespace std;
namespace piper {
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
struct SynthesisResult {
double inferSeconds;
double audioSeconds;
double realTimeFactor;
};
// Phoneme ids to WAV audio
void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
ModelSession &session, vector<int16_t> &audioBuffer,
SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
synthesisConfig.noiseW};
vector<Ort::Value> inputTensors;
vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
vector<int64_t> scalesShape{(int64_t)scales.size()};
inputTensors.push_back(
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
// Add speaker id.
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
if (synthesisConfig.speakerId) {
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// From export_onnx.py
array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
"sid"};
array<const char *, 1> outputNames = {"output"};
// Infer
auto startTime = chrono::steady_clock::now();
auto outputTensors = session.onnx.Run(
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
inputTensors.size(), outputNames.data(), outputNames.size());
auto endTime = chrono::steady_clock::now();
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
throw runtime_error("Invalid output tensors");
}
auto inferDuration = chrono::duration<double>(endTime - startTime);
result.inferSeconds = inferDuration.count();
const float *audio = outputTensors.front().GetTensorData<float>();
auto audioShape =
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
int64_t audioCount = audioShape[audioShape.size() - 1];
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
result.realTimeFactor = 0.0;
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
// Get max audio value for scaling
float maxAudioValue = 0.01f;
for (int64_t i = 0; i < audioCount; i++) {
float audioValue = abs(audio[i]);
if (audioValue > maxAudioValue) {
maxAudioValue = audioValue;
}
}
// We know the size up front
audioBuffer.reserve(audioCount);
// Scale audio to fill range and convert to int16
float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
for (int64_t i = 0; i < audioCount; i++) {
int16_t intAudioValue = static_cast<int16_t>(
clamp(audio[i] * audioScale,
static_cast<float>(numeric_limits<int16_t>::min()),
static_cast<float>(numeric_limits<int16_t>::max())));
audioBuffer.push_back(intAudioValue);
}
// Clean up
for (size_t i = 0; i < outputTensors.size(); i++) {
Ort::detail::OrtRelease(outputTensors[i].release());
}
for (size_t i = 0; i < inputTensors.size(); i++) {
Ort::detail::OrtRelease(inputTensors[i].release());
}
}
} // namespace piper
#endif // SYNTHESIZE_H_

@@ -3,8 +3,6 @@
#include <iostream>
namespace piper {
struct WavHeader {
uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
uint32_t chunkSize;
@@ -14,7 +12,7 @@ struct WavHeader {
uint8_t fmt[4] = {'f', 'm', 't', ' '};
uint32_t fmtSize = 16; // bytes
uint16_t audioFormat = 1; // PCM
uint16_t numChannels; // mono
uint16_t numChannels; // mono
uint32_t sampleRate; // Hertz
uint32_t bytesPerSec; // sampleRate * sampleWidth
uint16_t blockAlign = 2; // 16-bit mono
@@ -39,6 +37,4 @@ void writeWavHeader(int sampleRate, int sampleWidth, int channels,
} /* writeWavHeader */
} // namespace piper
#endif // WAVFILE_H_
