diff --git a/bert_vits2/attentions.py b/bert_vits2/attentions.py
index 80df44f..a027094 100644
--- a/bert_vits2/attentions.py
+++ b/bert_vits2/attentions.py
@@ -42,11 +42,11 @@ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_s
         self.kernel_size = kernel_size
         self.p_dropout = p_dropout
         self.window_size = window_size
-        if isflow:
-            cond_layer = torch.nn.Conv1d(256, 2 * hidden_channels * n_layers, 1)
-            self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
-            self.cond_layer = weight_norm(cond_layer, name='weight')
-            self.gin_channels = 256
+        # if isflow:
+        #     cond_layer = torch.nn.Conv1d(256, 2 * hidden_channels * n_layers, 1)
+        #     self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
+        #     self.cond_layer = weight_norm(cond_layer, name='weight')
+        #     self.gin_channels = 256
         self.cond_layer_idx = self.n_layers
         if 'gin_channels' in kwargs:
             self.gin_channels = kwargs['gin_channels']
diff --git a/bert_vits2/models.py b/bert_vits2/models.py
index e4d04da..ce25763 100644
--- a/bert_vits2/models.py
+++ b/bert_vits2/models.py
@@ -643,8 +643,8 @@ def __init__(self,
         self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
         self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

-        if n_speakers >= 1:
-            self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        if self.n_speakers > 0:
+            self.emb_g = nn.Embedding(self.n_speakers, gin_channels)
         else:
             self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
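
Note on the models.py hunk: the gate for the speaker-conditioning branch now reads the self.n_speakers attribute rather than the raw constructor argument, with the same effect as before: multi-speaker models get an nn.Embedding lookup, and n_speakers == 0 falls back to the ReferenceEncoder. Below is a minimal sketch, not part of the patch, of how that embedding is typically consumed downstream in a VITS-style model; the sid variable and the example sizes are illustrative assumptions.

# Illustrative sketch only -- not from this patch. Shows how the two branches
# gated by n_speakers are typically used to build the conditioning vector g.
import torch
import torch.nn as nn

n_speakers = 4          # assumed multi-speaker configuration
gin_channels = 256      # assumed conditioning width

if n_speakers > 0:
    # multi-speaker: learned per-speaker embedding, selected by speaker id
    emb_g = nn.Embedding(n_speakers, gin_channels)
    sid = torch.tensor([1])                # hypothetical speaker id
    g = emb_g(sid).unsqueeze(-1)           # shape [batch, gin_channels, 1]
else:
    # single-speaker: conditioning would come from ReferenceEncoder(spec) instead
    g = None

print(None if g is None else tuple(g.shape))   # (1, 256, 1)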