diff --git a/app.py b/app.py index 5dd7547..d3d7167 100644 --- a/app.py +++ b/app.py @@ -7,7 +7,7 @@ from flask_apscheduler import APScheduler from functools import wraps from utils.utils import clean_folder, check_is_none -from utils.merge import merge_model +from utils.load_model import merge_model from io import BytesIO app = Flask(__name__) diff --git a/bert_vits2/bert_vits2.py b/bert_vits2/bert_vits2.py index 9811360..9e37c1b 100644 --- a/bert_vits2/bert_vits2.py +++ b/bert_vits2/bert_vits2.py @@ -7,7 +7,8 @@ from bert_vits2.models import SynthesizerTrn from bert_vits2.text import symbols, cleaned_text_to_sequence, get_bert from bert_vits2.text.cleaner import clean_text -from utils.nlp import sentence_split, cut +from bert_vits2.text.symbols import get_symbols +from utils.sentence import sentence_split, cut class Bert_VITS2: @@ -16,11 +17,20 @@ def __init__(self, model, config, device=torch.device("cpu")): self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0) self.speakers = [item[0] for item in sorted(list(getattr(self.hps_ms.data, 'spk2id', {'0': 0}).items()), key=lambda x: x[1])] + + self.legacy = getattr(self.hps_ms.data, 'legacy', False) + symbols, num_tones, self.language_id_map, num_languages, self.language_tone_start_map = get_symbols( + legacy=self.legacy) + self._symbol_to_id = {s: i for i, s in enumerate(symbols)} + self.net_g = SynthesizerTrn( len(symbols), self.hps_ms.data.filter_length // 2 + 1, self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, n_speakers=self.hps_ms.data.n_speakers, + symbols=symbols, + num_tones=num_tones, + num_languages=num_languages, **self.hps_ms.model).to(device) _ = self.net_g.eval() self.device = device @@ -35,7 +45,8 @@ def get_speakers(self): def get_text(self, text, language_str, hps): norm_text, phone, tone, word2ph = clean_text(text, language_str) # print([f"{p}{t}" for p, t in zip(phone, tone)]) - phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) + phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, self._symbol_to_id, + self.language_tone_start_map, self.language_id_map) if hps.data.add_blank: phone = commons.intersperse(phone, 0) diff --git a/bert_vits2/models.py b/bert_vits2/models.py index ce25763..f8f01cd 100644 --- a/bert_vits2/models.py +++ b/bert_vits2/models.py @@ -11,7 +11,6 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from bert_vits2.commons import init_weights, get_padding -from bert_vits2.text import symbols, num_tones, num_languages class DurationDiscriminator(nn.Module): # vits2 @@ -254,7 +253,10 @@ def __init__(self, n_layers, kernel_size, p_dropout, - gin_channels=0): + gin_channels=0, + symbols=None, + num_tones=None, + num_languages=None): super().__init__() self.n_vocab = n_vocab self.out_channels = out_channels @@ -620,6 +622,9 @@ def __init__(self, self.current_mas_noise_scale = self.mas_noise_scale_initial if self.use_spk_conditioned_encoder and gin_channels > 0: self.enc_gin_channels = gin_channels + symbols = kwargs.get("symbols") + num_tones = kwargs.get("num_tones") + num_languages = kwargs.get("num_languages") self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, @@ -628,7 +633,11 @@ def __init__(self, n_layers, kernel_size, p_dropout, - gin_channels=self.enc_gin_channels) + gin_channels=self.enc_gin_channels, + symbols=symbols, + num_tones=num_tones, + num_languages=num_languages + ) self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, 
upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, diff --git a/bert_vits2/text/__init__.py b/bert_vits2/text/__init__.py index 3cf3a71..7d81c25 100644 --- a/bert_vits2/text/__init__.py +++ b/bert_vits2/text/__init__.py @@ -1,17 +1,12 @@ -from bert_vits2.text.symbols import * -from .chinese_bert import get_bert_feature as zh_bert -from .english_bert_mock import get_bert_feature as en_bert +from bert_vits2.text.symbols import punctuation -_symbol_to_id = {s: i for i, s in enumerate(symbols)} - - -def cleaned_text_to_sequence(cleaned_text, tones, language): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - Args: - text: string to convert to a sequence - Returns: - List of integers corresponding to the symbols in the text - ''' +def cleaned_text_to_sequence(cleaned_text, tones, language, _symbol_to_id, language_tone_start_map, language_id_map): + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + """ phones = [_symbol_to_id[symbol] for symbol in cleaned_text] tone_start = language_tone_start_map[language] tones = [i + tone_start for i in tones] @@ -21,9 +16,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language): def get_bert(norm_text, word2ph, language): - lang_bert_func_map = { - 'ZH': zh_bert, - 'EN': en_bert - } - bert = lang_bert_func_map[language](norm_text, word2ph) + if language == "ZH": + from .chinese_bert import get_bert_feature as zh_bert + lang_bert_func = zh_bert + elif language == "EN": + from .english_bert_mock import get_bert_feature as en_bert + lang_bert_func = en_bert + elif language == "JP": + from .japanese_bert import get_bert_feature as jp_bert + lang_bert_func = jp_bert + + bert = lang_bert_func(norm_text, word2ph) return bert diff --git a/bert_vits2/text/chinese_bert.py b/bert_vits2/text/chinese_bert.py index 3079c47..dae88a1 100644 --- a/bert_vits2/text/chinese_bert.py +++ b/bert_vits2/text/chinese_bert.py @@ -3,20 +3,18 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM from logger import logger -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - try: logger.info("Loading chinese-roberta-wwm-ext-large...") tokenizer = AutoTokenizer.from_pretrained(config.ABS_PATH + "/bert_vits2/bert/chinese-roberta-wwm-ext-large") model = AutoModelForMaskedLM.from_pretrained(config.ABS_PATH + "/bert_vits2/bert/chinese-roberta-wwm-ext-large").to( - device) + config.DEVICE) logger.info("Loading finished.") except Exception as e: logger.error(e) - logger.error(f"Please download model from hfl/chinese-roberta-wwm-ext-large.") + logger.error(f"Please download pytorch_model.bin from hfl/chinese-roberta-wwm-ext-large.") -def get_bert_feature(text, word2ph): +def get_bert_feature(text, word2ph, device=config.DEVICE): with torch.no_grad(): inputs = tokenizer(text, return_tensors='pt') for i in inputs: @@ -37,7 +35,6 @@ def get_bert_feature(text, word2ph): if __name__ == '__main__': - # feature = get_bert_feature('你好,我是说的道理。') import torch word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 diff --git a/bert_vits2/text/japanese.py b/bert_vits2/text/japanese.py index d92633e..7f388e2 100644 --- a/bert_vits2/text/japanese.py +++ b/bert_vits2/text/japanese.py @@ -1,104 +1,584 @@ -# modified from 
https://github.com/CjangCjengh/vits/blob/main/text/japanese.py +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit import re -import sys - -import pyopenjtalk - -from bert_vits2.text import symbols - -# Regular expression matching Japanese without punctuation marks: -_japanese_characters = re.compile( - r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') - -# Regular expression matching non-Japanese characters or punctuation marks: -_japanese_marks = re.compile( - r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') - -# List of (symbol, Japanese) pairs for marks: -_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ - ('%', 'パーセント') -]] - -# List of (consonant, sokuon) pairs: -_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ - (r'Q([↑↓]*[kg])', r'k#\1'), - (r'Q([↑↓]*[tdjʧ])', r't#\1'), - (r'Q([↑↓]*[sʃ])', r's\1'), - (r'Q([↑↓]*[pb])', r'p#\1') -]] - -# List of (consonant, hatsuon) pairs: -_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ - (r'N([↑↓]*[pbm])', r'm\1'), - (r'N([↑↓]*[ʧʥj])', r'n^\1'), - (r'N([↑↓]*[tdn])', r'n\1'), - (r'N([↑↓]*[kg])', r'ŋ\1') -]] - - -def post_replace_ph(ph): - rep_map = { - ':': ',', - ';': ',', - ',': ',', - '。': '.', - '!': '!', - '?': '?', - '\n': '.', - "·": ",", - '、': ",", - '...': '…', - 'v': "V" - } - if ph in rep_map.keys(): - ph = rep_map[ph] - if ph in symbols: - return ph - if ph not in symbols: - ph = 'UNK' - return ph - - -def symbols_to_japanese(text): - for regex, replacement in _symbols_to_japanese: - text = re.sub(regex, replacement, text) - return text - - -def preprocess_jap(text): - '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' - text = symbols_to_japanese(text) - sentences = re.split(_japanese_marks, text) - marks = re.findall(_japanese_marks, text) - text = [] - for i, sentence in enumerate(sentences): - if re.match(_japanese_characters, sentence): - p = pyopenjtalk.g2p(sentence) - text += p.split(" ") - - if i < len(marks): - text += [marks[i].replace(' ', '')] - return text +import unicodedata + +from transformers import AutoTokenizer + +from bert_vits2.text import punctuation, symbols +from bert_vits2.text.japanese_bert import tokenizer + +try: + import MeCab +except ImportError as e: + raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e +from num2words import num2words + +_CONVRULES = [ + # Conversion of 2 letters + "アァ/ a a", + "イィ/ i i", + "イェ/ i e", + "イャ/ y a", + "ウゥ/ u:", + "エェ/ e e", + "オォ/ o:", + "カァ/ k a:", + "キィ/ k i:", + "クゥ/ k u:", + "クャ/ ky a", + "クュ/ ky u", + "クョ/ ky o", + "ケェ/ k e:", + "コォ/ k o:", + "ガァ/ g a:", + "ギィ/ g i:", + "グゥ/ g u:", + "グャ/ gy a", + "グュ/ gy u", + "グョ/ gy o", + "ゲェ/ g e:", + "ゴォ/ g o:", + "サァ/ s a:", + "シィ/ sh i:", + "スゥ/ s u:", + "スャ/ sh a", + "スュ/ sh u", + "スョ/ sh o", + "セェ/ s e:", + "ソォ/ s o:", + "ザァ/ z a:", + "ジィ/ j i:", + "ズゥ/ z u:", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ゼェ/ z e:", + "ゾォ/ z o:", + "タァ/ t a:", + "チィ/ ch i:", + "ツァ/ ts a", + "ツィ/ ts i", + "ツゥ/ ts u:", + "ツャ/ ch a", + "ツュ/ ch u", + "ツョ/ ch o", + "ツェ/ ts e", + "ツォ/ ts o", + "テェ/ t e:", + "トォ/ t o:", + "ダァ/ d a:", + "ヂィ/ j i:", + "ヅゥ/ d u:", + "ヅャ/ zy a", + "ヅュ/ zy u", + "ヅョ/ zy o", + "デェ/ d e:", + "ドォ/ d o:", + "ナァ/ n a:", + "ニィ/ n i:", + "ヌゥ/ n u:", + "ヌャ/ ny a", + "ヌュ/ ny u", + "ヌョ/ ny o", + "ネェ/ n e:", + "ノォ/ n o:", + "ハァ/ h a:", + "ヒィ/ h i:", + "フゥ/ f u:", + 
"フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "ヘェ/ h e:", + "ホォ/ h o:", + "バァ/ b a:", + "ビィ/ b i:", + "ブゥ/ b u:", + "フャ/ hy a", + "ブュ/ by u", + "フョ/ hy o", + "ベェ/ b e:", + "ボォ/ b o:", + "パァ/ p a:", + "ピィ/ p i:", + "プゥ/ p u:", + "プャ/ py a", + "プュ/ py u", + "プョ/ py o", + "ペェ/ p e:", + "ポォ/ p o:", + "マァ/ m a:", + "ミィ/ m i:", + "ムゥ/ m u:", + "ムャ/ my a", + "ムュ/ my u", + "ムョ/ my o", + "メェ/ m e:", + "モォ/ m o:", + "ヤァ/ y a:", + "ユゥ/ y u:", + "ユャ/ y a:", + "ユュ/ y u:", + "ユョ/ y o:", + "ヨォ/ y o:", + "ラァ/ r a:", + "リィ/ r i:", + "ルゥ/ r u:", + "ルャ/ ry a", + "ルュ/ ry u", + "ルョ/ ry o", + "レェ/ r e:", + "ロォ/ r o:", + "ワァ/ w a:", + "ヲォ/ o:", + "ディ/ d i", + "デェ/ d e:", + "デャ/ dy a", + "デュ/ dy u", + "デョ/ dy o", + "ティ/ t i", + "テェ/ t e:", + "テャ/ ty a", + "テュ/ ty u", + "テョ/ ty o", + "スィ/ s i", + "ズァ/ z u a", + "ズィ/ z i", + "ズゥ/ z u", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ズェ/ z e", + "ズォ/ z o", + "キャ/ ky a", + "キュ/ ky u", + "キョ/ ky o", + "シャ/ sh a", + "シュ/ sh u", + "シェ/ sh e", + "ショ/ sh o", + "チャ/ ch a", + "チュ/ ch u", + "チェ/ ch e", + "チョ/ ch o", + "トゥ/ t u", + "トャ/ ty a", + "トュ/ ty u", + "トョ/ ty o", + "ドァ/ d o a", + "ドゥ/ d u", + "ドャ/ dy a", + "ドュ/ dy u", + "ドョ/ dy o", + "ドォ/ d o:", + "ニャ/ ny a", + "ニュ/ ny u", + "ニョ/ ny o", + "ヒャ/ hy a", + "ヒュ/ hy u", + "ヒョ/ hy o", + "ミャ/ my a", + "ミュ/ my u", + "ミョ/ my o", + "リャ/ ry a", + "リュ/ ry u", + "リョ/ ry o", + "ギャ/ gy a", + "ギュ/ gy u", + "ギョ/ gy o", + "ヂェ/ j e", + "ヂャ/ j a", + "ヂュ/ j u", + "ヂョ/ j o", + "ジェ/ j e", + "ジャ/ j a", + "ジュ/ j u", + "ジョ/ j o", + "ビャ/ by a", + "ビュ/ by u", + "ビョ/ by o", + "ピャ/ py a", + "ピュ/ py u", + "ピョ/ py o", + "ウァ/ u a", + "ウィ/ w i", + "ウェ/ w e", + "ウォ/ w o", + "ファ/ f a", + "フィ/ f i", + "フゥ/ f u", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "フェ/ f e", + "フォ/ f o", + "ヴァ/ b a", + "ヴィ/ b i", + "ヴェ/ b e", + "ヴォ/ b o", + "ヴュ/ by u", + # Conversion of 1 letter + "ア/ a", + "イ/ i", + "ウ/ u", + "エ/ e", + "オ/ o", + "カ/ k a", + "キ/ k i", + "ク/ k u", + "ケ/ k e", + "コ/ k o", + "サ/ s a", + "シ/ sh i", + "ス/ s u", + "セ/ s e", + "ソ/ s o", + "タ/ t a", + "チ/ ch i", + "ツ/ ts u", + "テ/ t e", + "ト/ t o", + "ナ/ n a", + "ニ/ n i", + "ヌ/ n u", + "ネ/ n e", + "ノ/ n o", + "ハ/ h a", + "ヒ/ h i", + "フ/ f u", + "ヘ/ h e", + "ホ/ h o", + "マ/ m a", + "ミ/ m i", + "ム/ m u", + "メ/ m e", + "モ/ m o", + "ラ/ r a", + "リ/ r i", + "ル/ r u", + "レ/ r e", + "ロ/ r o", + "ガ/ g a", + "ギ/ g i", + "グ/ g u", + "ゲ/ g e", + "ゴ/ g o", + "ザ/ z a", + "ジ/ j i", + "ズ/ z u", + "ゼ/ z e", + "ゾ/ z o", + "ダ/ d a", + "ヂ/ j i", + "ヅ/ z u", + "デ/ d e", + "ド/ d o", + "バ/ b a", + "ビ/ b i", + "ブ/ b u", + "ベ/ b e", + "ボ/ b o", + "パ/ p a", + "ピ/ p i", + "プ/ p u", + "ペ/ p e", + "ポ/ p o", + "ヤ/ y a", + "ユ/ y u", + "ヨ/ y o", + "ワ/ w a", + "ヰ/ i", + "ヱ/ e", + "ヲ/ o", + "ン/ N", + "ッ/ q", + "ヴ/ b u", + "ー/:", + # Try converting broken text + "ァ/ a", + "ィ/ i", + "ゥ/ u", + "ェ/ e", + "ォ/ o", + "ヮ/ w a", + "ォ/ o", + # Symbols + "、/ ,", + "。/ .", + "!/ !", + "?/ ?", + "・/ ,", +] + +_COLON_RX = re.compile(":+") +_REJECT_RX = re.compile("[^ a-zA-Z:,.?]") + + +def _makerulemap(): + l = [tuple(x.split("/")) for x in _CONVRULES] + return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2)) + + +_RULEMAP1, _RULEMAP2 = _makerulemap() + + +def kata2phoneme(text: str) -> str: + """Convert katakana text to phonemes.""" + text = text.strip() + res = [] + while text: + if len(text) >= 2: + x = _RULEMAP2.get(text[:2]) + if x is not None: + text = text[2:] + res += x.split(" ")[1:] + continue + x = _RULEMAP1.get(text[0]) + if x is not None: + text = text[1:] + res += x.split(" ")[1:] + continue + res.append(text[0]) + text = text[1:] + # 
res = _COLON_RX.sub(":", res) + return res + + +_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1)) +_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1)) +_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + + +def hira2kata(text: str) -> str: + text = text.translate(_HIRA2KATATRANS) + return text.replace("う゛", "ヴ") + + +_SYMBOL_TOKENS = set(list("・、。?!")) +_NO_YOMI_TOKENS = set(list("「」『』―()[][]")) +_TAGGER = MeCab.Tagger() + + +def text2kata(text: str) -> str: + parsed = _TAGGER.parse(text) + res = [] + for line in parsed.split("\n"): + if line == "EOS": + break + parts = line.split("\t") + + word, yomi = parts[0], parts[1] + if yomi: + res.append(yomi) + else: + if word in _SYMBOL_TOKENS: + res.append(word) + elif word in ("っ", "ッ"): + res.append("ッ") + elif word in _NO_YOMI_TOKENS: + pass + else: + res.append(word) + return hira2kata("".join(res)) + + +_ALPHASYMBOL_YOMI = { + "#": "シャープ", + "%": "パーセント", + "&": "アンド", + "+": "プラス", + "-": "マイナス", + ":": "コロン", + ";": "セミコロン", + "<": "小なり", + "=": "イコール", + ">": "大なり", + "@": "アット", + "a": "エー", + "b": "ビー", + "c": "シー", + "d": "ディー", + "e": "イー", + "f": "エフ", + "g": "ジー", + "h": "エイチ", + "i": "アイ", + "j": "ジェー", + "k": "ケー", + "l": "エル", + "m": "エム", + "n": "エヌ", + "o": "オー", + "p": "ピー", + "q": "キュー", + "r": "アール", + "s": "エス", + "t": "ティー", + "u": "ユー", + "v": "ブイ", + "w": "ダブリュー", + "x": "エックス", + "y": "ワイ", + "z": "ゼット", + "α": "アルファ", + "β": "ベータ", + "γ": "ガンマ", + "δ": "デルタ", + "ε": "イプシロン", + "ζ": "ゼータ", + "η": "イータ", + "θ": "シータ", + "ι": "イオタ", + "κ": "カッパ", + "λ": "ラムダ", + "μ": "ミュー", + "ν": "ニュー", + "ξ": "クサイ", + "ο": "オミクロン", + "π": "パイ", + "ρ": "ロー", + "σ": "シグマ", + "τ": "タウ", + "υ": "ウプシロン", + "φ": "ファイ", + "χ": "カイ", + "ψ": "プサイ", + "ω": "オメガ", +} + + +_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+") +_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"} +_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])") +_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?") + + +def japanese_convert_numbers_to_words(text: str) -> str: + res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text) + res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res) + res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res) + return res + + +def japanese_convert_alpha_symbols_to_words(text: str) -> str: + return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()]) + + +def japanese_text_to_phonemes(text: str) -> str: + """Convert Japanese text to phonemes.""" + res = unicodedata.normalize("NFKC", text) + res = japanese_convert_numbers_to_words(res) + # res = japanese_convert_alpha_symbols_to_words(res) + res = text2kata(res) + res = kata2phoneme(res) + return res + + +def is_japanese_character(char): + # 定义日语文字系统的 Unicode 范围 + japanese_ranges = [ + (0x3040, 0x309F), # 平假名 + (0x30A0, 0x30FF), # 片假名 + (0x4E00, 0x9FFF), # 汉字 (CJK Unified Ideographs) + (0x3400, 0x4DBF), # 汉字扩展 A + (0x20000, 0x2A6DF), # 汉字扩展 B + # 可以根据需要添加其他汉字扩展范围 + ] + + # 将字符的 Unicode 编码转换为整数 + char_code = ord(char) + + # 检查字符是否在任何一个日语范围内 + for start, end in japanese_ranges: + if start <= char_code <= end: + return True + + return False + + +rep_map = { + ":": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", + "·": ",", + "、": ",", + "...": "…", +} + + +def replace_punctuation(text): + pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) + + replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) + + replaced_text = re.sub( + 
r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF" + + "".join(punctuation) + + r"]+", + "", + replaced_text, + ) + + return replaced_text def text_normalize(text): - return text + res = unicodedata.normalize("NFKC", text) + res = japanese_convert_numbers_to_words(res) + # res = "".join([i for i in res if is_japanese_character(i)]) + res = replace_punctuation(res) + return res + + +def distribute_phone(n_phone, n_word): + phones_per_word = [0] * n_word + for task in range(n_phone): + min_tasks = min(phones_per_word) + min_index = phones_per_word.index(min_tasks) + phones_per_word[min_index] += 1 + return phones_per_word def g2p(norm_text): - phones = preprocess_jap(norm_text) - phones = [post_replace_ph(i) for i in phones] + tokenized = tokenizer.tokenize(norm_text) + phs = [] + ph_groups = [] + for t in tokenized: + if not t.startswith("#"): + ph_groups.append([t]) + else: + ph_groups[-1].append(t.replace("#", "")) + word2ph = [] + for group in ph_groups: + phonemes = kata2phoneme(text2kata("".join(group))) + # phonemes = [i for i in phonemes if i in symbols] + for i in phonemes: + assert i in symbols, (group, norm_text, tokenized) + phone_len = len(phonemes) + word_len = len(group) + + aaa = distribute_phone(phone_len, word_len) + word2ph += aaa + phs += phonemes + phones = ["_"] + phs + ["_"] tones = [0 for i in phones] - word2ph = [1 for i in phones] + word2ph = [1] + word2ph + [1] return phones, tones, word2ph -if __name__ == '__main__': - for line in open("../../../Downloads/transcript_utf8.txt").readlines(): - text = line.split(":")[1] - phones, tones, word2ph = g2p(text) - for p in phones: - if p == "z": - print(text, phones) - sys.exit(0) +if __name__ == "__main__": + tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") + text = "hello,こんにちは、世界!……" + from bert_vits2.text.japanese_bert import get_bert_feature + + text = text_normalize(text) + print(text) + phones, tones, word2ph = g2p(text) + bert = get_bert_feature(text, word2ph) + + print(phones, tones, word2ph, bert.shape) diff --git a/bert_vits2/text/symbols.py b/bert_vits2/text/symbols.py index fd3d5db..a964a9f 100644 --- a/bert_vits2/text/symbols.py +++ b/bert_vits2/text/symbols.py @@ -1,52 +1,200 @@ -punctuation = ['!', '?', '…', ",", ".", "'", '-'] +punctuation = ["!", "?", "…", ",", ".", "'", "-"] pu_symbols = punctuation + ["SP", "UNK"] -pad = '_' +pad = "_" # chinese -zh_symbols = ['E', 'En', 'a', 'ai', 'an', 'ang', 'ao', 'b', 'c', 'ch', 'd', 'e', 'ei', 'en', 'eng', 'er', 'f', 'g', 'h', - 'i', 'i0', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'ir', 'iu', 'j', 'k', 'l', 'm', 'n', - 'o', - 'ong', - 'ou', 'p', 'q', 'r', 's', 'sh', 't', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', - 'vn', - 'w', 'x', 'y', 'z', 'zh', - "AA", "EE", "OO"] +zh_symbols = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "b", + "c", + "ch", + "d", + "e", + "ei", + "en", + "eng", + "er", + "f", + "g", + "h", + "i", + "i0", + "ia", + "ian", + "iang", + "iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "j", + "k", + "l", + "m", + "n", + "o", + "ong", + "ou", + "p", + "q", + "r", + "s", + "sh", + "t", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", + "w", + "x", + "y", + "z", + "zh", + "AA", + "EE", + "OO", +] num_zh_tones = 6 # japanese -ja_symbols = ['I', 'N', 'U', 'a', 'b', 'by', 'ch', 'cl', 'd', 'dy', 'e', 'f', 'g', 'gy', 'h', 'hy', 'i', 'j', 'k', 'ky', - 'm', 'my', 'n', 'ny', 'o', 'p', 'py', 'r', 'ry', 's', 'sh', 
't', 'ts', 'u', 'V', 'w', 'y', 'z'] +ja_symbols_legacy = ['I', 'N', 'U', 'a', 'b', 'by', 'ch', 'cl', 'd', 'dy', 'e', 'f', 'g', 'gy', 'h', 'hy', 'i', 'j', + 'k', 'ky', + 'm', 'my', 'n', 'ny', 'o', 'p', 'py', 'r', 'ry', 's', 'sh', 't', 'ts', 'u', 'V', 'w', 'y', 'z'] +ja_symbols = [ + "N", + "a", + "a:", + "b", + "by", + "ch", + "d", + "dy", + "e", + "e:", + "f", + "g", + "gy", + "h", + "hy", + "i", + "i:", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "o:", + "p", + "py", + "q", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "ty", + "u", + "u:", + "w", + "y", + "z", + "zy", +] num_ja_tones = 1 # English -en_symbols = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', - 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', - 'sh', 't', 'th', 'uh', 'uw', 'V', 'w', 'y', 'z', 'zh'] +en_symbols = [ + "aa", + "ae", + "ah", + "ao", + "aw", + "ay", + "b", + "ch", + "d", + "dh", + "eh", + "er", + "ey", + "f", + "g", + "hh", + "ih", + "iy", + "jh", + "k", + "l", + "m", + "n", + "ng", + "ow", + "oy", + "p", + "r", + "s", + "sh", + "t", + "th", + "uh", + "uw", + "V", + "w", + "y", + "z", + "zh", +] num_en_tones = 4 -# combine all symbols -normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) -symbols = [pad] + normal_symbols + pu_symbols -sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] - -# combine all tones -num_tones = num_zh_tones + num_ja_tones + num_en_tones - -# language maps -language_id_map = { - 'ZH': 0, - "JA": 1, - "EN": 2 -} -num_languages = len(language_id_map.keys()) - -language_tone_start_map = { - 'ZH': 0, - "JA": num_zh_tones, - "EN": num_zh_tones + num_ja_tones -} - -if __name__ == '__main__': - a = set(zh_symbols) - b = set(en_symbols) - print(sorted(a & b)) + +def get_symbols(legacy=False): + if legacy: + ja_symbols = ja_symbols_legacy + # combine all symbols + normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) + symbols = [pad] + normal_symbols + pu_symbols + sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] + + # combine all tones + num_tones = num_zh_tones + num_ja_tones + num_en_tones + + # language maps + language_id_map = {"ZH": 0, "JP": 1, "EN": 2} + num_languages = len(language_id_map.keys()) + + language_tone_start_map = { + "ZH": 0, + "JP": num_zh_tones, + "EN": num_zh_tones + num_ja_tones, + } + return symbols, num_tones, language_id_map, num_languages, language_tone_start_map + + +if __name__ == "__main__": + zh = set(zh_symbols) + en = set(en_symbols) + jp = set(ja_symbols) + print(zh) + print(en) + print(jp) + print(sorted(zh & en)) diff --git a/bert_vits2/text/tone_sandhi.py b/bert_vits2/text/tone_sandhi.py index c0a78a5..6a6e4c3 100644 --- a/bert_vits2/text/tone_sandhi.py +++ b/bert_vits2/text/tone_sandhi.py @@ -19,51 +19,442 @@ from pypinyin import Style -class ToneSandhi(): +class ToneSandhi: def __init__(self): self.must_neural_tone_words = { - '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', - '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊', - '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去', - '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号', - '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当', - '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻', - '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂', - '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆', - '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂', - 
'精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿', - '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台', - '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算', - '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨', - '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快', - '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜', - '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔', - '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事', - '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', - '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', - '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实', - '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', - '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', - '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', - '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', - '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', - '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', - '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', - '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', - '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', - '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', - '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', - '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', - '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', - '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', - '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', - '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', - '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', - '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', - '扫把', '惦记' + "麻烦", + "麻利", + "鸳鸯", + "高粱", + "骨头", + "骆驼", + "马虎", + "首饰", + "馒头", + "馄饨", + "风筝", + "难为", + "队伍", + "阔气", + "闺女", + "门道", + "锄头", + "铺盖", + "铃铛", + "铁匠", + "钥匙", + "里脊", + "里头", + "部分", + "那么", + "道士", + "造化", + "迷糊", + "连累", + "这么", + "这个", + "运气", + "过去", + "软和", + "转悠", + "踏实", + "跳蚤", + "跟头", + "趔趄", + "财主", + "豆腐", + "讲究", + "记性", + "记号", + "认识", + "规矩", + "见识", + "裁缝", + "补丁", + "衣裳", + "衣服", + "衙门", + "街坊", + "行李", + "行当", + "蛤蟆", + "蘑菇", + "薄荷", + "葫芦", + "葡萄", + "萝卜", + "荸荠", + "苗条", + "苗头", + "苍蝇", + "芝麻", + "舒服", + "舒坦", + "舌头", + "自在", + "膏药", + "脾气", + "脑袋", + "脊梁", + "能耐", + "胳膊", + "胭脂", + "胡萝", + "胡琴", + "胡同", + "聪明", + "耽误", + "耽搁", + "耷拉", + "耳朵", + "老爷", + "老实", + "老婆", + "老头", + "老太", + "翻腾", + "罗嗦", + "罐头", + "编辑", + "结实", + "红火", + "累赘", + "糨糊", + "糊涂", + "精神", + "粮食", + "簸箕", + "篱笆", + "算计", + "算盘", + "答应", + "笤帚", + "笑语", + "笑话", + "窟窿", + "窝囊", + "窗户", + "稳当", + "稀罕", + "称呼", + "秧歌", + "秀气", + "秀才", + "福气", + "祖宗", + "砚台", + "码头", + "石榴", + "石头", + "石匠", + "知识", + "眼睛", + "眯缝", + "眨巴", + "眉毛", + "相声", + "盘算", + "白净", + "痢疾", + "痛快", + "疟疾", + "疙瘩", + "疏忽", + "畜生", + "生意", + "甘蔗", + "琵琶", + "琢磨", + "琉璃", + "玻璃", + "玫瑰", + "玄乎", + "狐狸", + "状元", + "特务", + "牲口", + "牙碜", + "牌楼", + "爽快", + "爱人", + "热闹", + "烧饼", + "烟筒", + "烂糊", + "点心", + "炊帚", + "灯笼", + "火候", + "漂亮", + "滑溜", + "溜达", + "温和", + "清楚", + "消息", + "浪头", + "活泼", + "比方", + "正经", + "欺负", + "模糊", + "槟榔", + "棺材", + "棒槌", + "棉花", + "核桃", + "栅栏", + "柴火", + "架势", + "枕头", + "枇杷", + "机灵", + "本事", + "木头", + "木匠", + "朋友", + "月饼", + "月亮", + "暖和", + "明白", + "时候", + "新鲜", + 
"故事", + "收拾", + "收成", + "提防", + "挖苦", + "挑剔", + "指甲", + "指头", + "拾掇", + "拳头", + "拨弄", + "招牌", + "招呼", + "抬举", + "护士", + "折腾", + "扫帚", + "打量", + "打算", + "打点", + "打扮", + "打听", + "打发", + "扎实", + "扁担", + "戒指", + "懒得", + "意识", + "意思", + "情形", + "悟性", + "怪物", + "思量", + "怎么", + "念头", + "念叨", + "快活", + "忙活", + "志气", + "心思", + "得罪", + "张罗", + "弟兄", + "开通", + "应酬", + "庄稼", + "干事", + "帮手", + "帐篷", + "希罕", + "师父", + "师傅", + "巴结", + "巴掌", + "差事", + "工夫", + "岁数", + "屁股", + "尾巴", + "少爷", + "小气", + "小伙", + "将就", + "对头", + "对付", + "寡妇", + "家伙", + "客气", + "实在", + "官司", + "学问", + "学生", + "字号", + "嫁妆", + "媳妇", + "媒人", + "婆家", + "娘家", + "委屈", + "姑娘", + "姐夫", + "妯娌", + "妥当", + "妖精", + "奴才", + "女婿", + "头发", + "太阳", + "大爷", + "大方", + "大意", + "大夫", + "多少", + "多么", + "外甥", + "壮实", + "地道", + "地方", + "在乎", + "困难", + "嘴巴", + "嘱咐", + "嘟囔", + "嘀咕", + "喜欢", + "喇嘛", + "喇叭", + "商量", + "唾沫", + "哑巴", + "哈欠", + "哆嗦", + "咳嗽", + "和尚", + "告诉", + "告示", + "含糊", + "吓唬", + "后头", + "名字", + "名堂", + "合同", + "吆喝", + "叫唤", + "口袋", + "厚道", + "厉害", + "千斤", + "包袱", + "包涵", + "匀称", + "勤快", + "动静", + "动弹", + "功夫", + "力气", + "前头", + "刺猬", + "刺激", + "别扭", + "利落", + "利索", + "利害", + "分析", + "出息", + "凑合", + "凉快", + "冷战", + "冤枉", + "冒失", + "养活", + "关系", + "先生", + "兄弟", + "便宜", + "使唤", + "佩服", + "作坊", + "体面", + "位置", + "似的", + "伙计", + "休息", + "什么", + "人家", + "亲戚", + "亲家", + "交情", + "云彩", + "事情", + "买卖", + "主意", + "丫头", + "丧气", + "两口", + "东西", + "东家", + "世故", + "不由", + "不在", + "下水", + "下巴", + "上头", + "上司", + "丈夫", + "丈人", + "一辈", + "那个", + "菩萨", + "父亲", + "母亲", + "咕噜", + "邋遢", + "费用", + "冤家", + "甜头", + "介绍", + "荒唐", + "大人", + "泥鳅", + "幸福", + "熟悉", + "计划", + "扑腾", + "蜡烛", + "姥爷", + "照顾", + "喉咙", + "吉他", + "弄堂", + "蚂蚱", + "凤凰", + "拖沓", + "寒碜", + "糟蹋", + "倒腾", + "报复", + "逻辑", + "盘缠", + "喽啰", + "牢骚", + "咖喱", + "扫把", + "惦记", } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" + "男子", + "女子", + "分子", + "原子", + "量子", + "莲子", + "石子", + "瓜子", + "电子", + "人人", + "虎虎", } self.punc = ":,;。?!“”‘’':,;.?!" @@ -72,14 +463,15 @@ def __init__(self): # word: "家里" # pos: "s" # finals: ['ia1', 'i3'] - def _neural_sandhi(self, word: str, pos: str, - finals: List[str]) -> List[str]: - + def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[0] in { - "n", "v", "a" - } and word not in self.must_not_neural_tone_words: + if ( + j - 1 >= 0 + and item == word[j - 1] + and pos[0] in {"n", "v", "a"} + and word not in self.must_not_neural_tone_words + ): finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": @@ -89,9 +481,12 @@ def _neural_sandhi(self, word: str, pos: str, # e.g. 走了, 看着, 去过 # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: # finals[-1] = finals[-1][:-1] + "5" - elif len(word) > 1 and word[-1] in "们子" and pos in { - "r", "n" - } and word not in self.must_not_neural_tone_words: + elif ( + len(word) > 1 + and word[-1] in "们子" + and pos in {"r", "n"} + and word not in self.must_not_neural_tone_words + ): finals[-1] = finals[-1][:-1] + "5" # e.g. 
桌上, 地下, 家里 elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: @@ -100,21 +495,26 @@ def _neural_sandhi(self, word: str, pos: str, elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": finals[-1] = finals[-1][:-1] + "5" # 个做量词 - elif (ge_idx >= 1 and - (word[ge_idx - 1].isnumeric() or - word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个': + elif ( + ge_idx >= 1 + and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ) or word == "个": finals[ge_idx] = finals[ge_idx][:-1] + "5" else: - if word in self.must_neural_tone_words or word[ - -2:] in self.must_neural_tone_words: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): finals[-1] = finals[-1][:-1] + "5" word_list = self._split_word(word) - finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]] + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] for i, word in enumerate(word_list): # conventional neural in Chinese - if word in self.must_neural_tone_words or word[ - -2:] in self.must_neural_tone_words: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): finals_list[i][-1] = finals_list[i][-1][:-1] + "5" finals = sum(finals_list, []) return finals @@ -126,17 +526,17 @@ def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: else: for i, char in enumerate(word): # "不" before tone4 should be bu2, e.g. 不怕 - if char == "不" and i + 1 < len(word) and finals[i + - 1][-1] == "4": + if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": finals[i] = finals[i][:-1] + "2" return finals def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: # "一" in number sequences, e.g. 一零零, 二一零 if word.find("一") != -1 and all( - [item.isnumeric() for item in word if item != "一"]): + [item.isnumeric() for item in word if item != "一"] + ): return finals - # "一" between reduplication words shold be yi5, e.g. 看一看 + # "一" between reduplication words should be yi5, e.g. 看一看 elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: finals[1] = finals[1][:-1] + "5" # when "一" is ordinal word, it should be yi1 @@ -161,10 +561,10 @@ def _split_word(self, word: str) -> List[str]: first_subword = word_list[0] first_begin_idx = word.find(first_subword) if first_begin_idx == 0: - second_subword = word[len(first_subword):] + second_subword = word[len(first_subword) :] new_word_list = [first_subword, second_subword] else: - second_subword = word[:-len(first_subword)] + second_subword = word[: -len(first_subword)] new_word_list = [second_subword, first_subword] return new_word_list @@ -182,18 +582,19 @@ def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: elif len(word_list[0]) == 1: finals[1] = finals[1][:-1] + "2" else: - finals_list = [ - finals[:len(word_list[0])], finals[len(word_list[0]):] - ] + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] if len(finals_list) == 2: for i, sub in enumerate(finals_list): # e.g. 所有/人 if self._all_tone_three(sub) and len(sub) == 2: finals_list[i][0] = finals_list[i][0][:-1] + "2" # e.g. 
好/喜欢 - elif i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and \ - finals_list[0][-1][-1] == "3": - + elif ( + i == 1 + and not self._all_tone_three(sub) + and finals_list[i][0][-1] == "3" + and finals_list[0][-1][-1] == "3" + ): finals_list[0][-1] = finals_list[0][-1][:-1] + "2" finals = sum(finals_list, []) # split idiom into two words who's length is 2 @@ -222,7 +623,7 @@ def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg.append((word, pos)) last_word = word[:] if last_word == "不": - new_seg.append((last_word, 'd')) + new_seg.append((last_word, "d")) last_word = "" return new_seg @@ -236,12 +637,21 @@ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] # function 1 for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ - 0] == seg[i + 1][0] and seg[i - 1][1] == "v": + if ( + i - 1 >= 0 + and word == "一" + and i + 1 < len(seg) + and seg[i - 1][0] == seg[i + 1][0] + and seg[i - 1][1] == "v" + ): new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] else: - if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ - 0] == word and pos == "v": + if ( + i - 2 >= 0 + and seg[i - 1][0] == "一" + and seg[i - 2][0] == word + and pos == "v" + ): continue else: new_seg.append([word, pos]) @@ -257,22 +667,27 @@ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: # the first and the second words are all_tone_three def _merge_continuous_three_tones( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and self._all_tone_three( - sub_finals_list[i - 1]) and self._all_tone_three( - sub_finals_list[i]) and not merge_last[i - 1]: + if ( + i - 1 >= 0 + and self._all_tone_three(sub_finals_list[i - 1]) + and self._all_tone_three(sub_finals_list[i]) + and not merge_last[i - 1] + ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len( - seg[i - 1][0]) + len(seg[i][0]) <= 3: + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -287,21 +702,27 @@ def _is_reduplication(self, word: str) -> bool: # the last char of first word and the first char of second word is tone_three def _merge_continuous_three_tones_2( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \ - merge_last[i - 1]: + if ( + i - 1 >= 0 + and sub_finals_list[i - 1][-1][-1] == "3" + and sub_finals_list[i][0][-1] == "3" + and not merge_last[i - 1] + ): # if the last 
word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len( - seg[i - 1][0]) + len(seg[i][0]) <= 3: + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -319,8 +740,7 @@ def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg.append([word, pos]) return new_seg - def _merge_reduplication( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] for i, (word, pos) in enumerate(seg): if new_seg and word == new_seg[-1][0]: @@ -329,8 +749,7 @@ def _merge_reduplication( new_seg.append([word, pos]) return new_seg - def pre_merge_for_modify( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: seg = self._merge_bu(seg) try: seg = self._merge_yi(seg) @@ -342,8 +761,7 @@ def pre_merge_for_modify( seg = self._merge_er(seg) return seg - def modified_tone(self, word: str, pos: str, - finals: List[str]) -> List[str]: + def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/config.py b/config.py index 252040a..5ef6a73 100644 --- a/config.py +++ b/config.py @@ -1,6 +1,8 @@ import os import sys +import torch + JSON_AS_ASCII = False MAX_CONTENT_LENGTH = 5242880 @@ -79,6 +81,8 @@ # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path. # DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml" +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + """ Default parameter """ diff --git a/requirements.txt b/requirements.txt index b425cfc..efbbe65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,5 @@ fastlid langid phonemizer==3.2.1 transformers -pydantic==2.3.0 \ No newline at end of file +pydantic==2.3.0 +num2words \ No newline at end of file diff --git a/utils/merge.py b/utils/load_model.py similarity index 100% rename from utils/merge.py rename to utils/load_model.py diff --git a/utils/nlp.py b/utils/sentence.py similarity index 100% rename from utils/nlp.py rename to utils/sentence.py diff --git a/vits/vits.py b/vits/vits.py index 21c06c3..cc7cd3d 100644 --- a/vits/vits.py +++ b/vits/vits.py @@ -4,7 +4,7 @@ import numpy as np import torch from torch import no_grad, LongTensor, inference_mode, FloatTensor -from utils.nlp import sentence_split +from utils.sentence import sentence_split from vits.mel_processing import spectrogram_torch from vits.text import text_to_sequence from vits.models import SynthesizerTrn
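
Usage sketch (not part of the patch), assuming only the signatures visible in the hunks above; the sample phoneme/tone lists are hypothetical. get_symbols() now returns per-model symbol/tone/language tables instead of module-level globals (note that in the hunk above ja_symbols appears to be rebound only inside the legacy branch, so this sketch passes legacy=True), and cleaned_text_to_sequence() receives those tables explicitly, matching how Bert_VITS2.get_text() calls it:

# Minimal sketch: exercising the refactored symbol plumbing from this diff.
from bert_vits2.text.symbols import get_symbols
from bert_vits2.text import cleaned_text_to_sequence

# legacy=True selects ja_symbols_legacy (models trained before the Japanese g2p
# rewrite); Bert_VITS2.__init__ reads this flag from hps.data.legacy.
symbols, num_tones, language_id_map, num_languages, language_tone_start_map = get_symbols(legacy=True)
symbol_to_id = {s: i for i, s in enumerate(symbols)}

# Hypothetical cleaned Chinese input (the phone/tone lists clean_text(text, "ZH")
# would produce); every phoneme used here is present in zh_symbols.
phones = ["_", "n", "i", "h", "ao", "_"]
tones = [0, 3, 3, 3, 3, 0]

phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(
    phones, tones, "ZH", symbol_to_id, language_tone_start_map, language_id_map
)
print(phone_ids, tone_ids, lang_ids)

The same tables are what get forwarded into SynthesizerTrn via the new symbols / num_tones / num_languages kwargs, so a legacy checkpoint and a current one can coexist in one process, each with its own id mapping.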