Merge remote-tracking branch 'pumpkin/preprocess'

pull/172/head
Michael Hansen 11 months ago
commit 4cca517199

@@ -1,372 +0,0 @@
import argparse
import json
import sys
import unicodedata
from collections import Counter
from enum import Enum
from typing import Dict, Iterable, List, Mapping, Optional
from espeak_phonemizer import Phonemizer


class PhonemeType(str, Enum):
    ESPEAK = "espeak"
    """Phonemes come from espeak-ng"""

    TEXT = "text"
    """Phonemes come from text itself"""


MAX_PHONEMES = 256
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"_": [0],
"^": [1],
"$": [2],
" ": [3],
"!": [4],
"'": [5],
"(": [6],
")": [7],
",": [8],
"-": [9],
".": [10],
":": [11],
";": [12],
"?": [13],
"a": [14],
"b": [15],
"c": [16],
"d": [17],
"e": [18],
"f": [19],
"h": [20],
"i": [21],
"j": [22],
"k": [23],
"l": [24],
"m": [25],
"n": [26],
"o": [27],
"p": [28],
"q": [29],
"r": [30],
"s": [31],
"t": [32],
"u": [33],
"v": [34],
"w": [35],
"x": [36],
"y": [37],
"z": [38],
"æ": [39],
"ç": [40],
"ð": [41],
"ø": [42],
"ħ": [43],
"ŋ": [44],
"œ": [45],
"ǀ": [46],
"ǁ": [47],
"ǂ": [48],
"ǃ": [49],
"ɐ": [50],
"ɑ": [51],
"ɒ": [52],
"ɓ": [53],
"ɔ": [54],
"ɕ": [55],
"ɖ": [56],
"ɗ": [57],
"ɘ": [58],
"ə": [59],
"ɚ": [60],
"ɛ": [61],
"ɜ": [62],
"ɞ": [63],
"ɟ": [64],
"ɠ": [65],
"ɡ": [66],
"ɢ": [67],
"ɣ": [68],
"ɤ": [69],
"ɥ": [70],
"ɦ": [71],
"ɧ": [72],
"ɨ": [73],
"ɪ": [74],
"ɫ": [75],
"ɬ": [76],
"ɭ": [77],
"ɮ": [78],
"ɯ": [79],
"ɰ": [80],
"ɱ": [81],
"ɲ": [82],
"ɳ": [83],
"ɴ": [84],
"ɵ": [85],
"ɶ": [86],
"ɸ": [87],
"ɹ": [88],
"ɺ": [89],
"ɻ": [90],
"ɽ": [91],
"ɾ": [92],
"ʀ": [93],
"ʁ": [94],
"ʂ": [95],
"ʃ": [96],
"ʄ": [97],
"ʈ": [98],
"ʉ": [99],
"ʊ": [100],
"ʋ": [101],
"ʌ": [102],
"ʍ": [103],
"ʎ": [104],
"ʏ": [105],
"ʐ": [106],
"ʑ": [107],
"ʒ": [108],
"ʔ": [109],
"ʕ": [110],
"ʘ": [111],
"ʙ": [112],
"ʛ": [113],
"ʜ": [114],
"ʝ": [115],
"ʟ": [116],
"ʡ": [117],
"ʢ": [118],
"ʲ": [119],
"ˈ": [120],
"ˌ": [121],
"ː": [122],
"ˑ": [123],
"˞": [124],
"β": [125],
"θ": [126],
"χ": [127],
"": [128],
"": [129],
"0": [130], # tones
"1": [131],
"2": [132],
"3": [133],
"4": [134],
"5": [135],
"6": [136],
"7": [137],
"8": [138],
"9": [139],
"\u0327": [140], # combining cedilla
"\u0303": [141], # combining tilde
"\u032a": [142], # combining bridge below
"\u032f": [143], # combining inverted breve below
"\u0329": [144], # combining vertical line below
"ʰ": [145],
"ˤ": [146],
"ε": [147],
"": [148],
"#": [149], # Icelandic
'"': [150], # Russian
"": [151],
"\u033a": [152], # Basque
"\u033b": [153],
}
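
# The ids above index the model's symbol table (MAX_PHONEMES entries; see the
# "num_symbols" config field written by preprocess.py). "_" is a pad symbol
# interleaved between phonemes, and "^"/"$" mark utterance start/end
# (see phonemes_to_ids below).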
PHONEME_MAPS = {
    # Brazilian Portuguese
    "pt-br": {"c": ["k"]}
}
ALPHABETS = {
    # Ukrainian
    "uk": {
        "_": [0],
        "^": [1],
        "$": [2],
        " ": [3],
        "!": [4],
        "'": [5],
        ",": [6],
        "-": [7],
        ".": [8],
        ":": [9],
        ";": [10],
        "?": [11],
        "а": [12],
        "б": [13],
        "в": [14],
        "г": [15],
        "ґ": [16],
        "д": [17],
        "е": [18],
        "є": [19],
        "ж": [20],
        "з": [21],
        "и": [22],
        "і": [23],
        "ї": [24],
        "й": [25],
        "к": [26],
        "л": [27],
        "м": [28],
        "н": [29],
        "о": [30],
        "п": [31],
        "р": [32],
        "с": [33],
        "т": [34],
        "у": [35],
        "ф": [36],
        "х": [37],
        "ц": [38],
        "ч": [39],
        "ш": [40],
        "щ": [41],
        "ь": [42],
        "ю": [43],
        "я": [44],
        "\u0301": [45],  # combining acute accent
        "\u0306": [46],  # combining breve
        "\u0308": [47],  # combining diaeresis
        "—": [48],  # em dash
    }
}


def phonemize(
    text: str,
    phonemizer: Phonemizer,
    phoneme_map: Optional[Dict[str, List[str]]] = None,
) -> List[str]:
    phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)

    # Phonemes are decomposed into unicode codepoints
    unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str))
    if not phoneme_map:
        return unmapped_phonemes

    # Phonemes can be mapped to lists of other phonemes
    mapped_phonemes = []
    for phoneme in unmapped_phonemes:
        sub_phonemes = phoneme_map.get(phoneme)
        if sub_phonemes:
            mapped_phonemes.extend(sub_phonemes)
        else:
            mapped_phonemes.append(phoneme)

    return mapped_phonemes
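
# A minimal usage sketch (hypothetical voice and text; requires espeak-ng via
# espeak_phonemizer):
#
#     phonemizer = Phonemizer(default_voice="pt-br")
#     phonemes = phonemize("Olá, mundo", phonemizer,
#                          phoneme_map=PHONEME_MAPS.get("pt-br"))
#     # any "c" in the espeak output is re-mapped to "k" (PHONEME_MAPS["pt-br"])
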
def phonemes_to_ids(
    phonemes: Iterable[str],
    phoneme_id_map: Optional[Mapping[str, Iterable[int]]] = None,
    missing_phonemes: "Optional[Counter[str]]" = None,
    pad: Optional[str] = "_",
    bos: Optional[str] = "^",
    eos: Optional[str] = "$",
) -> List[int]:
    if phoneme_id_map is None:
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_ids: List[int] = []

    if bos:
        phoneme_ids.extend(phoneme_id_map[bos])

    if pad:
        phoneme_ids.extend(phoneme_id_map[pad])

    for phoneme in phonemes:
        mapped_phoneme_ids = phoneme_id_map.get(phoneme)
        if mapped_phoneme_ids:
            phoneme_ids.extend(mapped_phoneme_ids)

            if pad:
                phoneme_ids.extend(phoneme_id_map[pad])
        elif missing_phonemes is not None:
            # Make note of missing phonemes
            missing_phonemes[phoneme] += 1

    if eos:
        phoneme_ids.extend(phoneme_id_map[eos])

    return phoneme_ids
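
# Worked example against DEFAULT_PHONEME_ID_MAP (defaults pad="_", bos="^",
# eos="$"):
#
#     phonemes_to_ids(["h", "ə"])
#     # -> [1, 0, 20, 0, 59, 0, 2]
#     #     ^  _  h   _  ə   _  $
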
# -----------------------------------------------------------------------------
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("language")
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    args = parser.parse_args()

    phonemizer: Optional[Phonemizer] = None
if args.text_casing == "lower":
casing = str.lower
elif args.text_casing == "upper":
casing = str.upper
else:
# ignore
casing = lambda s: s
    if args.phoneme_type == PhonemeType.TEXT:
        # Use text directly
        phoneme_id_map = ALPHABETS[args.language]
    else:
        # Use eSpeak
        phonemizer = Phonemizer(args.language)
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_map = PHONEME_MAPS.get(args.language)
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if args.phoneme_type == PhonemeType.TEXT:
            phonemes = list(unicodedata.normalize("NFD", casing(line)))
        else:
            assert phonemizer is not None
            phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)

        phoneme_ids = phonemes_to_ids(
            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
        )
        json.dump(
            {
                "text": line,
                "phonemes": phonemes,
                "phoneme_ids": phoneme_ids,
            },
            sys.stdout,
            ensure_ascii=False,
        )
        print("")

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
        for phoneme, count in missing_phonemes.most_common():
            print(phoneme, count, file=sys.stderr)


if __name__ == "__main__":
    main()
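
# Hypothetical invocation sketch (script/file name assumed): the script reads
# utterance text from stdin and writes one JSON object per line to stdout:
#
#     echo 'hello world' | python3 phonemize.py en-us
#     # {"text": "hello world", "phonemes": [...], "phoneme_ids": [...]}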

@@ -9,28 +9,37 @@ import os
 import unicodedata
 from collections import Counter
 from dataclasses import dataclass, field
+from enum import Enum
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional

-from espeak_phonemizer import Phonemizer
+from piper_phonemize import (
+    phonemize_espeak,
+    phonemize_codepoints,
+    phoneme_ids_espeak,
+    phoneme_ids_codepoints,
+    get_codepoints_map,
+    get_espeak_map,
+    get_max_phonemes,
+    tashkeel_run,
+)

 from .norm_audio import cache_norm_audio, make_silence_detector
-from .phonemize import (
-    ALPHABETS,
-    DEFAULT_PHONEME_ID_MAP,
-    MAX_PHONEMES,
-    PHONEME_MAPS,
-    PhonemeType,
-    phonemes_to_ids,
-    phonemize,
-)

 _DIR = Path(__file__).parent
 _VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip()

 _LOGGER = logging.getLogger("preprocess")


+class PhonemeType(str, Enum):
+    ESPEAK = "espeak"
+    """Phonemes come from espeak-ng"""
+
+    TEXT = "text"
+    """Phonemes come from text itself"""
+
+
 def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -150,10 +159,10 @@ def main() -> None:
         "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
         "phoneme_type": args.phoneme_type.value,
         "phoneme_map": {},
-        "phoneme_id_map": ALPHABETS[args.language]
+        "phoneme_id_map": get_codepoints_map()[args.language]
         if args.phoneme_type == PhonemeType.TEXT
-        else DEFAULT_PHONEME_ID_MAP,
-        "num_symbols": MAX_PHONEMES,
+        else get_espeak_map(),
+        "num_symbols": get_max_phonemes(),
         "num_speakers": len(speaker_counts),
         "speaker_id_map": speaker_ids,
         "piper_version": _VERSION,
@@ -255,8 +264,6 @@
     try:
         casing = get_text_casing(args.text_casing)
         silence_detector = make_silence_detector()
-        phonemizer = Phonemizer(default_voice=args.language)
-        phoneme_map = PHONEME_MAPS.get(args.language)

         while True:
             utt_batch = queue_in.get()
@@ -266,10 +273,15 @@
             for utt in utt_batch:
                 try:
                     _LOGGER.debug(utt)
-                    utt.phonemes = phonemize(
-                        casing(utt.text), phonemizer, phoneme_map=phoneme_map
-                    )
-                    utt.phoneme_ids = phonemes_to_ids(
+                    all_phonemes = phonemize_espeak(casing(utt.text), args.language)
+
+                    # Flatten
+                    utt.phonemes = [
+                        phoneme
+                        for sentence_phonemes in all_phonemes
+                        for phoneme in sentence_phonemes
+                    ]
+                    utt.phoneme_ids = phoneme_ids_espeak(
                         utt.phonemes,
                         missing_phonemes=utt.missing_phonemes,
                     )
@@ -298,7 +310,6 @@ def phonemize_batch_text(
     try:
         casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
-        alphabet = ALPHABETS[args.language]

         while True:
             utt_batch = queue_in.get()
@@ -308,10 +319,16 @@
             for utt in utt_batch:
                 try:
                     _LOGGER.debug(utt)
-                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
-                    utt.phoneme_ids = phonemes_to_ids(
+                    all_phonemes = phonemize_codepoints(casing(utt.text))
+                    # Flatten
+                    utt.phonemes = [
+                        phoneme
+                        for sentence_phonemes in all_phonemes
+                        for phoneme in sentence_phonemes
+                    ]
+                    utt.phoneme_ids = phoneme_ids_codepoints(
+                        args.language,
                         utt.phonemes,
-                        phoneme_id_map=alphabet,
                         missing_phonemes=utt.missing_phonemes,
                     )

                     if not args.skip_audio:
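
For reference, a minimal sketch of the piper_phonemize calls introduced above, using only the signatures visible in this diff (the text and voice values are illustrative):

    from piper_phonemize import phonemize_espeak, phoneme_ids_espeak

    # One phoneme list per sentence; preprocess.py flattens these
    all_phonemes = phonemize_espeak("Hello world.", "en-us")
    phonemes = [p for sentence in all_phonemes for p in sentence]

    # Map phonemes to ids (optionally pass missing_phonemes= a Counter,
    # as preprocess.py does, to record unmapped phonemes)
    phoneme_ids = phoneme_ids_espeak(phonemes)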

@@ -1,7 +1,7 @@
 cython>=0.29.0,<1
-espeak-phonemizer>=1.1.0,<2
+piper-phonemize~=1.0.0
 librosa>=0.9.2,<1
 numpy>=1.19.0
-onnxruntime~=1.11.0
+onnxruntime>=1.11.0
 pytorch-lightning~=1.7.0
-torch~=1.11.0
+torch>=1.11.0,<2
