You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
piper/src/cpp/synthesize.hpp

131 lines
4.3 KiB
C++

#ifndef SYNTHESIZE_H_
#define SYNTHESIZE_H_
#include <array>
#include <chrono>
#include <limits>
#include <memory>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include "config.hpp"
#include "model.hpp"
using namespace std;
namespace piper {
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
struct SynthesisResult {
double inferSeconds;
double audioSeconds;
double realTimeFactor;
};
// Phoneme ids to WAV audio
void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
ModelSession &session, vector<int16_t> &audioBuffer,
SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
synthesisConfig.noiseW};
vector<Ort::Value> inputTensors;
vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
vector<int64_t> scalesShape{(int64_t)scales.size()};
inputTensors.push_back(
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
// Add speaker id.
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
if (synthesisConfig.speakerId) {
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// From export_onnx.py
array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
"sid"};
array<const char *, 1> outputNames = {"output"};
// Infer
auto startTime = chrono::steady_clock::now();
auto outputTensors = session.onnx.Run(
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
inputTensors.size(), outputNames.data(), outputNames.size());
auto endTime = chrono::steady_clock::now();
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
throw runtime_error("Invalid output tensors");
}
auto inferDuration = chrono::duration<double>(endTime - startTime);
result.inferSeconds = inferDuration.count();
const float *audio = outputTensors.front().GetTensorData<float>();
auto audioShape =
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
int64_t audioCount = audioShape[audioShape.size() - 1];
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
result.realTimeFactor = 0.0;
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
// Get max audio value for scaling
float maxAudioValue = 0.01f;
for (int64_t i = 0; i < audioCount; i++) {
float audioValue = abs(audio[i]);
if (audioValue > maxAudioValue) {
maxAudioValue = audioValue;
}
}
// We know the size up front
audioBuffer.reserve(audioCount);
// Scale audio to fill range and convert to int16
float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
for (int64_t i = 0; i < audioCount; i++) {
int16_t intAudioValue = static_cast<int16_t>(
clamp(audio[i] * audioScale,
static_cast<float>(numeric_limits<int16_t>::min()),
static_cast<float>(numeric_limits<int16_t>::max())));
audioBuffer.push_back(intAudioValue);
}
// Clean up
for (size_t i = 0; i < outputTensors.size(); i++) {
Ort::detail::OrtRelease(outputTensors[i].release());
}
for (size_t i = 0; i < inputTensors.size(); i++) {
Ort::detail::OrtRelease(inputTensors[i].release());
}
}
} // namespace piper
#endif // SYNTHESIZE_H_