-
Notifications
You must be signed in to change notification settings - Fork 209
/
make_metadata.py
59 lines (51 loc) · 2 KB
/
make_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
Generate speaker embeddings and metadata for training
"""
import os
import pickle
from model_bl import D_VECTOR
from collections import OrderedDict
import numpy as np
import torch
# Speaker encoder (project-local D_VECTOR), put in eval mode and moved to the
# GPU; it is called below on mel-spectrogram crops to produce embeddings.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
c_checkpoint = torch.load('3000000-BL.ckpt')
# Drop the first 7 characters of every parameter name so the keys match C's
# own names (presumably a 'module.' prefix left by DataParallel -- TODO confirm).
new_state_dict = OrderedDict()
for key, val in c_checkpoint['model_b'].items():
    new_state_dict[key[7:]] = val
C.load_state_dict(new_state_dict)

num_uttrs = 10   # number of utterances averaged into one speaker embedding
len_crop = 128   # crop length along the first axis of each loaded array

# Directory containing mel-spectrograms
rootDir = './spmel'
dirName, subdirList, _ = next(os.walk(rootDir))
print('Found directory: %s' % dirName)

# One entry per speaker: [speaker_name, mean_embedding, path_1, path_2, ...],
# where each path is relative to rootDir.
speakers = []
for speaker in sorted(subdirList):
    print('Processing speaker: %s' % speaker)
    utterances = [speaker]
    _, _, fileList = next(os.walk(os.path.join(dirName, speaker)))

    # make speaker embedding
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(fileList) < num_uttrs:
        raise ValueError(
            'Speaker %s has %d utterances; at least %d are required'
            % (speaker, len(fileList), num_uttrs))
    idx_uttrs = np.random.choice(len(fileList), size=num_uttrs, replace=False)
    # Indices that were not drawn above; used as replacements for utterances
    # that are too short. Loop-invariant, so computed once per speaker.
    spare_idx = np.delete(np.arange(len(fileList)), idx_uttrs)

    embs = []
    for idx in idx_uttrs:
        tmp = np.load(os.path.join(dirName, speaker, fileList[idx]))
        # Boolean-mask filtering below creates new arrays, so spare_idx itself
        # is never modified and every utterance starts from the full pool.
        candidates = spare_idx
        # choose another utterance if the current one is too short
        while tmp.shape[0] < len_crop:
            if candidates.size == 0:
                raise ValueError(
                    'Speaker %s has no remaining utterance with at least '
                    '%d frames' % (speaker, len_crop))
            idx_alt = np.random.choice(candidates)
            tmp = np.load(os.path.join(dirName, speaker, fileList[idx_alt]))
            candidates = candidates[candidates != idx_alt]
        # randint's upper bound is exclusive: the +1 keeps an utterance of
        # exactly len_crop frames valid (left == 0) and makes the last
        # possible window reachable.
        left = np.random.randint(0, tmp.shape[0] - len_crop + 1)
        melsp = torch.from_numpy(tmp[np.newaxis, left:left + len_crop, :]).cuda()
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            emb = C(melsp)
        embs.append(emb.detach().squeeze().cpu().numpy())
    # The speaker embedding is the mean over the sampled crops.
    utterances.append(np.mean(embs, axis=0))

    # create file list
    for fileName in sorted(fileList):
        utterances.append(os.path.join(speaker, fileName))
    speakers.append(utterances)

with open(os.path.join(rootDir, 'train.pkl'), 'wb') as handle:
    pickle.dump(speakers, handle)