forked from Tomiinek/Multilingual_Text_to_Speech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
synthesize.py
122 lines (92 loc) · 5.64 KB
/
synthesize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import sys
import os
from datetime import datetime
import numpy as np
import torch
from utils import audio, text
from utils import build_model
from params.params import Params as hp
from modules.tacotron2 import Tacotron
"""
******************************************************** INSTRUCTIONS ********************************************************
* *
* The script expects input utterances on stdin, every example on a separate line. *
* *
* Different models expect different lines, some have to specify speaker, language, etc.: *
* ID is used as name of the output file. *
* Speaker and language IDs have to be the same as in parameters (see hp.languages and hp.speakers). *
* *
* MONO-lingual and SINGLE-speaker: id|single input utterance per line *
* OTHERWISE id|single input utterance|speaker|language *
* OTHERWISE with PER-CHARACTER lang: id|single input utterance|speaker|l1-(length of l1),l2-(length of l2),l1 *
* where the last language takes all remaining characters *
* example: "01|guten tag jean-paul.|speaker|de-10,fr-9,de" *
* OTHERWISE with accent control: id|single input utterance|speaker|l1-(len1),l2*0.75:l3*0.25-(len2),l1 *
* accent can be controlled by weighting per-language characters *
* language codes must be separated by : and weights are assigned using '*number' *
* example: "01|guten tag jean-paul.|speaker|de-10,fr*0.75:de*0.25-9,de" *
* the numbers do not have to sum up to one because they are normalized later *
* *
******************************************************************************************************************************
"""
def synthesize(model, input_data, force_cpu=False):
    """Synthesize a spectrogram for a single '|'-separated input line.

    The line is split on '|' and its fields are read positionally:
    field 1 is the utterance text, field 2 the speaker ID (read only when
    ``hp.multi_speaker`` is set) and field 3 the language specification
    (read only when ``hp.multi_language`` is set). Field 0 (the utterance
    ID) is not used here.

    Args:
        model: Object exposing ``inference(text, speaker=..., language=...)``.
        input_data: One raw input line, e.g.
            ``"01|guten tag jean-paul.|speaker|de-10,fr-9,de"``.
        force_cpu: If true, tensors stay on the CPU even when CUDA is available.

    Returns:
        The value returned by ``audio.denormalize_spectrogram`` for the
        model output (the model output is converted to a NumPy array first).

    Raises:
        IndexError: If the line lacks a field required by the configuration.
        ValueError: If a language or speaker ID is not present in
            ``hp.languages`` / ``hp.unique_speakers``, or a length/weight in
            the language specification is not numeric.
    """
    fields = input_data.split('|')
    clean_text = fields[1]

    # Apply the text normalization steps selected in the hyper-parameters.
    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    tokens = torch.LongTensor(text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        # NOTE(review): the "+ 1" presumably covers an extra symbol appended
        # by text.to_sequence -- confirm against that helper.
        remaining = len(clean_text) + 1
        per_symbol_weights = []
        for segment in fields[3].split(','):
            # A segment is "<langs>-<length>"; a segment without a length
            # takes whatever number of symbols is still unassigned.
            segment_parts = segment.split('-')

            weights = [0] * hp.language_number
            for weighted_code in segment_parts[0].split(':'):
                # "<code>" means weight 1, "<code>*<w>" an explicit weight.
                code_and_weight = weighted_code.split('*')
                index = hp.languages.index(code_and_weight[0])
                weights[index] = 1 if len(code_and_weight) == 1 else float(code_and_weight[1])

            segment_length = int(segment_parts[1]) if len(segment_parts) == 2 else remaining
            per_symbol_weights += [weights] * segment_length
            remaining -= segment_length
        # Wrapped in an outer list to add a leading dimension of size one.
        languages = torch.FloatTensor([per_symbol_weights])
    else:
        languages = None

    if hp.multi_speaker:
        speaker = torch.LongTensor([hp.unique_speakers.index(fields[2])])
    else:
        speaker = None

    if torch.cuda.is_available() and not force_cpu:
        tokens = tokens.cuda(non_blocking=True)
        if languages is not None:
            languages = languages.cuda(non_blocking=True)
        if speaker is not None:
            speaker = speaker.cuda(non_blocking=True)

    # Gradients are never needed for synthesis; disabling autograd avoids
    # building a graph that would be discarded immediately.
    with torch.no_grad():
        prediction = model.inference(tokens, speaker=speaker, language=languages)

    spectrogram = prediction.cpu().detach().numpy()
    spectrogram = audio.denormalize_spectrogram(spectrogram, not hp.predict_linear)
    return spectrogram
if __name__ == '__main__':
    # Command-line entry point: read utterances from stdin (one per line, see
    # the instructions at the top of this file), synthesize each of them and
    # write spectrograms and/or waveforms named after the utterance ID.
    import argparse
    import re

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, required=True, help="Model checkpoint.")
    parser.add_argument("--output", type=str, default=".", help="Path to output directory.")
    parser.add_argument("--cpu", action='store_true', help="Force to run on CPU.")
    parser.add_argument("--save_spec", action='store_true', help="Saves also spectrograms if set.")
    parser.add_argument("--ignore_wav", action='store_true', help="Does not save waveforms if set.")
    args = parser.parse_args()

    print("Building model ...")
    model = build_model(args.checkpoint, args.cpu)
    model.eval()

    # Keep the raw lines: synthesize() performs its own '|' split. Lines that
    # are empty or whitespace-only are dropped (a bare truthiness check would
    # keep them because they still contain the newline character).
    inputs = [line.rstrip() for line in sys.stdin if line.strip()]

    # Create the output directory once, before any synthesis is attempted.
    os.makedirs(args.output, exist_ok=True)

    for i, line in enumerate(inputs):
        # Split here only to obtain the utterance ID (output file name) and
        # the text shown in the progress message.
        fields = line.split('|')
        if len(fields) < 2:
            print(f'Skipping malformed input line {i+1}: "{line}"', file=sys.stderr)
            continue
        utterance_id, utterance_text = fields[0], fields[1]

        print(f'Synthesizing({i+1}/{len(inputs)}): "{utterance_text}"')

        # Pass the whole line, not a single field, so that synthesize() can
        # also read the speaker and language fields.
        s = synthesize(model, line, args.cpu)

        if args.save_spec:
            np.save(os.path.join(args.output, f'{utterance_id}.npy'), s)

        if not args.ignore_wav:
            w = audio.inverse_spectrogram(s, not hp.predict_linear)
            audio.save(w, os.path.join(args.output, f'{utterance_id}.wav'))