diff --git a/app.py b/app.py index 5dd7547..d3d7167 100644 --- a/app.py +++ b/app.py @@ -7,7 +7,7 @@ from flask_apscheduler import APScheduler from functools import wraps from utils.utils import clean_folder, check_is_none -from utils.merge import merge_model +from utils.load_model import merge_model from io import BytesIO app = Flask(__name__) diff --git a/bert_vits2/bert_vits2.py b/bert_vits2/bert_vits2.py index 9811360..9e37c1b 100644 --- a/bert_vits2/bert_vits2.py +++ b/bert_vits2/bert_vits2.py @@ -7,7 +7,8 @@ from bert_vits2.models import SynthesizerTrn from bert_vits2.text import symbols, cleaned_text_to_sequence, get_bert from bert_vits2.text.cleaner import clean_text -from utils.nlp import sentence_split, cut +from bert_vits2.text.symbols import get_symbols +from utils.sentence import sentence_split, cut class Bert_VITS2: @@ -16,11 +17,20 @@ def __init__(self, model, config, device=torch.device("cpu")): self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0) self.speakers = [item[0] for item in sorted(list(getattr(self.hps_ms.data, 'spk2id', {'0': 0}).items()), key=lambda x: x[1])] + + self.legacy = getattr(self.hps_ms.data, 'legacy', False) + symbols, num_tones, self.language_id_map, num_languages, self.language_tone_start_map = get_symbols( + legacy=self.legacy) + self._symbol_to_id = {s: i for i, s in enumerate(symbols)} + self.net_g = SynthesizerTrn( len(symbols), self.hps_ms.data.filter_length // 2 + 1, self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, n_speakers=self.hps_ms.data.n_speakers, + symbols=symbols, + num_tones=num_tones, + num_languages=num_languages, **self.hps_ms.model).to(device) _ = self.net_g.eval() self.device = device @@ -35,7 +45,8 @@ def get_speakers(self): def get_text(self, text, language_str, hps): norm_text, phone, tone, word2ph = clean_text(text, language_str) # print([f"{p}{t}" for p, t in zip(phone, tone)]) - phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) + phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, self._symbol_to_id, + self.language_tone_start_map, self.language_id_map) if hps.data.add_blank: phone = commons.intersperse(phone, 0) diff --git a/bert_vits2/models.py b/bert_vits2/models.py index ce25763..f8f01cd 100644 --- a/bert_vits2/models.py +++ b/bert_vits2/models.py @@ -11,7 +11,6 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from bert_vits2.commons import init_weights, get_padding -from bert_vits2.text import symbols, num_tones, num_languages class DurationDiscriminator(nn.Module): # vits2 @@ -254,7 +253,10 @@ def __init__(self, n_layers, kernel_size, p_dropout, - gin_channels=0): + gin_channels=0, + symbols=None, + num_tones=None, + num_languages=None): super().__init__() self.n_vocab = n_vocab self.out_channels = out_channels @@ -620,6 +622,9 @@ def __init__(self, self.current_mas_noise_scale = self.mas_noise_scale_initial if self.use_spk_conditioned_encoder and gin_channels > 0: self.enc_gin_channels = gin_channels + symbols = kwargs.get("symbols") + num_tones = kwargs.get("num_tones") + num_languages = kwargs.get("num_languages") self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, @@ -628,7 +633,11 @@ def __init__(self, n_layers, kernel_size, p_dropout, - gin_channels=self.enc_gin_channels) + gin_channels=self.enc_gin_channels, + symbols=symbols, + num_tones=num_tones, + num_languages=num_languages + ) self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, 
upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, diff --git a/bert_vits2/text/__init__.py b/bert_vits2/text/__init__.py index 3cf3a71..7d81c25 100644 --- a/bert_vits2/text/__init__.py +++ b/bert_vits2/text/__init__.py @@ -1,17 +1,12 @@ -from bert_vits2.text.symbols import * -from .chinese_bert import get_bert_feature as zh_bert -from .english_bert_mock import get_bert_feature as en_bert +from bert_vits2.text.symbols import punctuation -_symbol_to_id = {s: i for i, s in enumerate(symbols)} - - -def cleaned_text_to_sequence(cleaned_text, tones, language): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - Args: - text: string to convert to a sequence - Returns: - List of integers corresponding to the symbols in the text - ''' +def cleaned_text_to_sequence(cleaned_text, tones, language, _symbol_to_id, language_tone_start_map, language_id_map): + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + """ phones = [_symbol_to_id[symbol] for symbol in cleaned_text] tone_start = language_tone_start_map[language] tones = [i + tone_start for i in tones] @@ -21,9 +16,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language): def get_bert(norm_text, word2ph, language): - lang_bert_func_map = { - 'ZH': zh_bert, - 'EN': en_bert - } - bert = lang_bert_func_map[language](norm_text, word2ph) + if language == "ZH": + from .chinese_bert import get_bert_feature as zh_bert + lang_bert_func = zh_bert + elif language == "EN": + from .english_bert_mock import get_bert_feature as en_bert + lang_bert_func = en_bert + elif language == "JP": + from .japanese_bert import get_bert_feature as jp_bert + lang_bert_func = jp_bert + + bert = lang_bert_func(norm_text, word2ph) return bert diff --git a/bert_vits2/text/chinese_bert.py b/bert_vits2/text/chinese_bert.py index 3079c47..dae88a1 100644 --- a/bert_vits2/text/chinese_bert.py +++ b/bert_vits2/text/chinese_bert.py @@ -3,20 +3,18 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM from logger import logger -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - try: logger.info("Loading chinese-roberta-wwm-ext-large...") tokenizer = AutoTokenizer.from_pretrained(config.ABS_PATH + "/bert_vits2/bert/chinese-roberta-wwm-ext-large") model = AutoModelForMaskedLM.from_pretrained(config.ABS_PATH + "/bert_vits2/bert/chinese-roberta-wwm-ext-large").to( - device) + config.DEVICE) logger.info("Loading finished.") except Exception as e: logger.error(e) - logger.error(f"Please download model from hfl/chinese-roberta-wwm-ext-large.") + logger.error(f"Please download pytorch_model.bin from hfl/chinese-roberta-wwm-ext-large.") -def get_bert_feature(text, word2ph): +def get_bert_feature(text, word2ph, device=config.DEVICE): with torch.no_grad(): inputs = tokenizer(text, return_tensors='pt') for i in inputs: @@ -37,7 +35,6 @@ def get_bert_feature(text, word2ph): if __name__ == '__main__': - # feature = get_bert_feature('你好,我是说的道理。') import torch word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征 diff --git a/bert_vits2/text/japanese.py b/bert_vits2/text/japanese.py index d92633e..7f388e2 100644 --- a/bert_vits2/text/japanese.py +++ b/bert_vits2/text/japanese.py @@ -1,104 +1,584 @@ -# modified from 
https://github.com/CjangCjengh/vits/blob/main/text/japanese.py +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit import re -import sys - -import pyopenjtalk - -from bert_vits2.text import symbols - -# Regular expression matching Japanese without punctuation marks: -_japanese_characters = re.compile( - r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') - -# Regular expression matching non-Japanese characters or punctuation marks: -_japanese_marks = re.compile( - r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') - -# List of (symbol, Japanese) pairs for marks: -_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ - ('%', 'パーセント') -]] - -# List of (consonant, sokuon) pairs: -_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ - (r'Q([↑↓]*[kg])', r'k#\1'), - (r'Q([↑↓]*[tdjʧ])', r't#\1'), - (r'Q([↑↓]*[sʃ])', r's\1'), - (r'Q([↑↓]*[pb])', r'p#\1') -]] - -# List of (consonant, hatsuon) pairs: -_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ - (r'N([↑↓]*[pbm])', r'm\1'), - (r'N([↑↓]*[ʧʥj])', r'n^\1'), - (r'N([↑↓]*[tdn])', r'n\1'), - (r'N([↑↓]*[kg])', r'ŋ\1') -]] - - -def post_replace_ph(ph): - rep_map = { - ':': ',', - ';': ',', - ',': ',', - '。': '.', - '!': '!', - '?': '?', - '\n': '.', - "·": ",", - '、': ",", - '...': '…', - 'v': "V" - } - if ph in rep_map.keys(): - ph = rep_map[ph] - if ph in symbols: - return ph - if ph not in symbols: - ph = 'UNK' - return ph - - -def symbols_to_japanese(text): - for regex, replacement in _symbols_to_japanese: - text = re.sub(regex, replacement, text) - return text - - -def preprocess_jap(text): - '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' - text = symbols_to_japanese(text) - sentences = re.split(_japanese_marks, text) - marks = re.findall(_japanese_marks, text) - text = [] - for i, sentence in enumerate(sentences): - if re.match(_japanese_characters, sentence): - p = pyopenjtalk.g2p(sentence) - text += p.split(" ") - - if i < len(marks): - text += [marks[i].replace(' ', '')] - return text +import unicodedata + +from transformers import AutoTokenizer + +from bert_vits2.text import punctuation, symbols +from bert_vits2.text.japanese_bert import tokenizer + +try: + import MeCab +except ImportError as e: + raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e +from num2words import num2words + +_CONVRULES = [ + # Conversion of 2 letters + "アァ/ a a", + "イィ/ i i", + "イェ/ i e", + "イャ/ y a", + "ウゥ/ u:", + "エェ/ e e", + "オォ/ o:", + "カァ/ k a:", + "キィ/ k i:", + "クゥ/ k u:", + "クャ/ ky a", + "クュ/ ky u", + "クョ/ ky o", + "ケェ/ k e:", + "コォ/ k o:", + "ガァ/ g a:", + "ギィ/ g i:", + "グゥ/ g u:", + "グャ/ gy a", + "グュ/ gy u", + "グョ/ gy o", + "ゲェ/ g e:", + "ゴォ/ g o:", + "サァ/ s a:", + "シィ/ sh i:", + "スゥ/ s u:", + "スャ/ sh a", + "スュ/ sh u", + "スョ/ sh o", + "セェ/ s e:", + "ソォ/ s o:", + "ザァ/ z a:", + "ジィ/ j i:", + "ズゥ/ z u:", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ゼェ/ z e:", + "ゾォ/ z o:", + "タァ/ t a:", + "チィ/ ch i:", + "ツァ/ ts a", + "ツィ/ ts i", + "ツゥ/ ts u:", + "ツャ/ ch a", + "ツュ/ ch u", + "ツョ/ ch o", + "ツェ/ ts e", + "ツォ/ ts o", + "テェ/ t e:", + "トォ/ t o:", + "ダァ/ d a:", + "ヂィ/ j i:", + "ヅゥ/ d u:", + "ヅャ/ zy a", + "ヅュ/ zy u", + "ヅョ/ zy o", + "デェ/ d e:", + "ドォ/ d o:", + "ナァ/ n a:", + "ニィ/ n i:", + "ヌゥ/ n u:", + "ヌャ/ ny a", + "ヌュ/ ny u", + "ヌョ/ ny o", + "ネェ/ n e:", + "ノォ/ n o:", + "ハァ/ h a:", + "ヒィ/ h i:", + "フゥ/ f u:", + 
"フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "ヘェ/ h e:", + "ホォ/ h o:", + "バァ/ b a:", + "ビィ/ b i:", + "ブゥ/ b u:", + "フャ/ hy a", + "ブュ/ by u", + "フョ/ hy o", + "ベェ/ b e:", + "ボォ/ b o:", + "パァ/ p a:", + "ピィ/ p i:", + "プゥ/ p u:", + "プャ/ py a", + "プュ/ py u", + "プョ/ py o", + "ペェ/ p e:", + "ポォ/ p o:", + "マァ/ m a:", + "ミィ/ m i:", + "ムゥ/ m u:", + "ムャ/ my a", + "ムュ/ my u", + "ムョ/ my o", + "メェ/ m e:", + "モォ/ m o:", + "ヤァ/ y a:", + "ユゥ/ y u:", + "ユャ/ y a:", + "ユュ/ y u:", + "ユョ/ y o:", + "ヨォ/ y o:", + "ラァ/ r a:", + "リィ/ r i:", + "ルゥ/ r u:", + "ルャ/ ry a", + "ルュ/ ry u", + "ルョ/ ry o", + "レェ/ r e:", + "ロォ/ r o:", + "ワァ/ w a:", + "ヲォ/ o:", + "ディ/ d i", + "デェ/ d e:", + "デャ/ dy a", + "デュ/ dy u", + "デョ/ dy o", + "ティ/ t i", + "テェ/ t e:", + "テャ/ ty a", + "テュ/ ty u", + "テョ/ ty o", + "スィ/ s i", + "ズァ/ z u a", + "ズィ/ z i", + "ズゥ/ z u", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ズェ/ z e", + "ズォ/ z o", + "キャ/ ky a", + "キュ/ ky u", + "キョ/ ky o", + "シャ/ sh a", + "シュ/ sh u", + "シェ/ sh e", + "ショ/ sh o", + "チャ/ ch a", + "チュ/ ch u", + "チェ/ ch e", + "チョ/ ch o", + "トゥ/ t u", + "トャ/ ty a", + "トュ/ ty u", + "トョ/ ty o", + "ドァ/ d o a", + "ドゥ/ d u", + "ドャ/ dy a", + "ドュ/ dy u", + "ドョ/ dy o", + "ドォ/ d o:", + "ニャ/ ny a", + "ニュ/ ny u", + "ニョ/ ny o", + "ヒャ/ hy a", + "ヒュ/ hy u", + "ヒョ/ hy o", + "ミャ/ my a", + "ミュ/ my u", + "ミョ/ my o", + "リャ/ ry a", + "リュ/ ry u", + "リョ/ ry o", + "ギャ/ gy a", + "ギュ/ gy u", + "ギョ/ gy o", + "ヂェ/ j e", + "ヂャ/ j a", + "ヂュ/ j u", + "ヂョ/ j o", + "ジェ/ j e", + "ジャ/ j a", + "ジュ/ j u", + "ジョ/ j o", + "ビャ/ by a", + "ビュ/ by u", + "ビョ/ by o", + "ピャ/ py a", + "ピュ/ py u", + "ピョ/ py o", + "ウァ/ u a", + "ウィ/ w i", + "ウェ/ w e", + "ウォ/ w o", + "ファ/ f a", + "フィ/ f i", + "フゥ/ f u", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "フェ/ f e", + "フォ/ f o", + "ヴァ/ b a", + "ヴィ/ b i", + "ヴェ/ b e", + "ヴォ/ b o", + "ヴュ/ by u", + # Conversion of 1 letter + "ア/ a", + "イ/ i", + "ウ/ u", + "エ/ e", + "オ/ o", + "カ/ k a", + "キ/ k i", + "ク/ k u", + "ケ/ k e", + "コ/ k o", + "サ/ s a", + "シ/ sh i", + "ス/ s u", + "セ/ s e", + "ソ/ s o", + "タ/ t a", + "チ/ ch i", + "ツ/ ts u", + "テ/ t e", + "ト/ t o", + "ナ/ n a", + "ニ/ n i", + "ヌ/ n u", + "ネ/ n e", + "ノ/ n o", + "ハ/ h a", + "ヒ/ h i", + "フ/ f u", + "ヘ/ h e", + "ホ/ h o", + "マ/ m a", + "ミ/ m i", + "ム/ m u", + "メ/ m e", + "モ/ m o", + "ラ/ r a", + "リ/ r i", + "ル/ r u", + "レ/ r e", + "ロ/ r o", + "ガ/ g a", + "ギ/ g i", + "グ/ g u", + "ゲ/ g e", + "ゴ/ g o", + "ザ/ z a", + "ジ/ j i", + "ズ/ z u", + "ゼ/ z e", + "ゾ/ z o", + "ダ/ d a", + "ヂ/ j i", + "ヅ/ z u", + "デ/ d e", + "ド/ d o", + "バ/ b a", + "ビ/ b i", + "ブ/ b u", + "ベ/ b e", + "ボ/ b o", + "パ/ p a", + "ピ/ p i", + "プ/ p u", + "ペ/ p e", + "ポ/ p o", + "ヤ/ y a", + "ユ/ y u", + "ヨ/ y o", + "ワ/ w a", + "ヰ/ i", + "ヱ/ e", + "ヲ/ o", + "ン/ N", + "ッ/ q", + "ヴ/ b u", + "ー/:", + # Try converting broken text + "ァ/ a", + "ィ/ i", + "ゥ/ u", + "ェ/ e", + "ォ/ o", + "ヮ/ w a", + "ォ/ o", + # Symbols + "、/ ,", + "。/ .", + "!/ !", + "?/ ?", + "・/ ,", +] + +_COLON_RX = re.compile(":+") +_REJECT_RX = re.compile("[^ a-zA-Z:,.?]") + + +def _makerulemap(): + l = [tuple(x.split("/")) for x in _CONVRULES] + return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2)) + + +_RULEMAP1, _RULEMAP2 = _makerulemap() + + +def kata2phoneme(text: str) -> str: + """Convert katakana text to phonemes.""" + text = text.strip() + res = [] + while text: + if len(text) >= 2: + x = _RULEMAP2.get(text[:2]) + if x is not None: + text = text[2:] + res += x.split(" ")[1:] + continue + x = _RULEMAP1.get(text[0]) + if x is not None: + text = text[1:] + res += x.split(" ")[1:] + continue + res.append(text[0]) + text = text[1:] + # 
res = _COLON_RX.sub(":", res) + return res + + +_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1)) +_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1)) +_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + + +def hira2kata(text: str) -> str: + text = text.translate(_HIRA2KATATRANS) + return text.replace("う゛", "ヴ") + + +_SYMBOL_TOKENS = set(list("・、。?!")) +_NO_YOMI_TOKENS = set(list("「」『』―()[][]")) +_TAGGER = MeCab.Tagger() + + +def text2kata(text: str) -> str: + parsed = _TAGGER.parse(text) + res = [] + for line in parsed.split("\n"): + if line == "EOS": + break + parts = line.split("\t") + + word, yomi = parts[0], parts[1] + if yomi: + res.append(yomi) + else: + if word in _SYMBOL_TOKENS: + res.append(word) + elif word in ("っ", "ッ"): + res.append("ッ") + elif word in _NO_YOMI_TOKENS: + pass + else: + res.append(word) + return hira2kata("".join(res)) + + +_ALPHASYMBOL_YOMI = { + "#": "シャープ", + "%": "パーセント", + "&": "アンド", + "+": "プラス", + "-": "マイナス", + ":": "コロン", + ";": "セミコロン", + "<": "小なり", + "=": "イコール", + ">": "大なり", + "@": "アット", + "a": "エー", + "b": "ビー", + "c": "シー", + "d": "ディー", + "e": "イー", + "f": "エフ", + "g": "ジー", + "h": "エイチ", + "i": "アイ", + "j": "ジェー", + "k": "ケー", + "l": "エル", + "m": "エム", + "n": "エヌ", + "o": "オー", + "p": "ピー", + "q": "キュー", + "r": "アール", + "s": "エス", + "t": "ティー", + "u": "ユー", + "v": "ブイ", + "w": "ダブリュー", + "x": "エックス", + "y": "ワイ", + "z": "ゼット", + "α": "アルファ", + "β": "ベータ", + "γ": "ガンマ", + "δ": "デルタ", + "ε": "イプシロン", + "ζ": "ゼータ", + "η": "イータ", + "θ": "シータ", + "ι": "イオタ", + "κ": "カッパ", + "λ": "ラムダ", + "μ": "ミュー", + "ν": "ニュー", + "ξ": "クサイ", + "ο": "オミクロン", + "π": "パイ", + "ρ": "ロー", + "σ": "シグマ", + "τ": "タウ", + "υ": "ウプシロン", + "φ": "ファイ", + "χ": "カイ", + "ψ": "プサイ", + "ω": "オメガ", +} + + +_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+") +_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"} +_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])") +_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?") + + +def japanese_convert_numbers_to_words(text: str) -> str: + res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text) + res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res) + res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res) + return res + + +def japanese_convert_alpha_symbols_to_words(text: str) -> str: + return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()]) + + +def japanese_text_to_phonemes(text: str) -> str: + """Convert Japanese text to phonemes.""" + res = unicodedata.normalize("NFKC", text) + res = japanese_convert_numbers_to_words(res) + # res = japanese_convert_alpha_symbols_to_words(res) + res = text2kata(res) + res = kata2phoneme(res) + return res + + +def is_japanese_character(char): + # 定义日语文字系统的 Unicode 范围 + japanese_ranges = [ + (0x3040, 0x309F), # 平假名 + (0x30A0, 0x30FF), # 片假名 + (0x4E00, 0x9FFF), # 汉字 (CJK Unified Ideographs) + (0x3400, 0x4DBF), # 汉字扩展 A + (0x20000, 0x2A6DF), # 汉字扩展 B + # 可以根据需要添加其他汉字扩展范围 + ] + + # 将字符的 Unicode 编码转换为整数 + char_code = ord(char) + + # 检查字符是否在任何一个日语范围内 + for start, end in japanese_ranges: + if start <= char_code <= end: + return True + + return False + + +rep_map = { + ":": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", + "·": ",", + "、": ",", + "...": "…", +} + + +def replace_punctuation(text): + pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) + + replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) + + replaced_text = re.sub( + 
r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF" + + "".join(punctuation) + + r"]+", + "", + replaced_text, + ) + + return replaced_text def text_normalize(text): - return text + res = unicodedata.normalize("NFKC", text) + res = japanese_convert_numbers_to_words(res) + # res = "".join([i for i in res if is_japanese_character(i)]) + res = replace_punctuation(res) + return res + + +def distribute_phone(n_phone, n_word): + phones_per_word = [0] * n_word + for task in range(n_phone): + min_tasks = min(phones_per_word) + min_index = phones_per_word.index(min_tasks) + phones_per_word[min_index] += 1 + return phones_per_word def g2p(norm_text): - phones = preprocess_jap(norm_text) - phones = [post_replace_ph(i) for i in phones] + tokenized = tokenizer.tokenize(norm_text) + phs = [] + ph_groups = [] + for t in tokenized: + if not t.startswith("#"): + ph_groups.append([t]) + else: + ph_groups[-1].append(t.replace("#", "")) + word2ph = [] + for group in ph_groups: + phonemes = kata2phoneme(text2kata("".join(group))) + # phonemes = [i for i in phonemes if i in symbols] + for i in phonemes: + assert i in symbols, (group, norm_text, tokenized) + phone_len = len(phonemes) + word_len = len(group) + + aaa = distribute_phone(phone_len, word_len) + word2ph += aaa + phs += phonemes + phones = ["_"] + phs + ["_"] tones = [0 for i in phones] - word2ph = [1 for i in phones] + word2ph = [1] + word2ph + [1] return phones, tones, word2ph -if __name__ == '__main__': - for line in open("../../../Downloads/transcript_utf8.txt").readlines(): - text = line.split(":")[1] - phones, tones, word2ph = g2p(text) - for p in phones: - if p == "z": - print(text, phones) - sys.exit(0) +if __name__ == "__main__": + tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3") + text = "hello,こんにちは、世界!……" + from bert_vits2.text.japanese_bert import get_bert_feature + + text = text_normalize(text) + print(text) + phones, tones, word2ph = g2p(text) + bert = get_bert_feature(text, word2ph) + + print(phones, tones, word2ph, bert.shape) diff --git a/bert_vits2/text/symbols.py b/bert_vits2/text/symbols.py index fd3d5db..a964a9f 100644 --- a/bert_vits2/text/symbols.py +++ b/bert_vits2/text/symbols.py @@ -1,52 +1,200 @@ -punctuation = ['!', '?', '…', ",", ".", "'", '-'] +punctuation = ["!", "?", "…", ",", ".", "'", "-"] pu_symbols = punctuation + ["SP", "UNK"] -pad = '_' +pad = "_" # chinese -zh_symbols = ['E', 'En', 'a', 'ai', 'an', 'ang', 'ao', 'b', 'c', 'ch', 'd', 'e', 'ei', 'en', 'eng', 'er', 'f', 'g', 'h', - 'i', 'i0', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'ir', 'iu', 'j', 'k', 'l', 'm', 'n', - 'o', - 'ong', - 'ou', 'p', 'q', 'r', 's', 'sh', 't', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', - 'vn', - 'w', 'x', 'y', 'z', 'zh', - "AA", "EE", "OO"] +zh_symbols = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "b", + "c", + "ch", + "d", + "e", + "ei", + "en", + "eng", + "er", + "f", + "g", + "h", + "i", + "i0", + "ia", + "ian", + "iang", + "iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "j", + "k", + "l", + "m", + "n", + "o", + "ong", + "ou", + "p", + "q", + "r", + "s", + "sh", + "t", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", + "w", + "x", + "y", + "z", + "zh", + "AA", + "EE", + "OO", +] num_zh_tones = 6 # japanese -ja_symbols = ['I', 'N', 'U', 'a', 'b', 'by', 'ch', 'cl', 'd', 'dy', 'e', 'f', 'g', 'gy', 'h', 'hy', 'i', 'j', 'k', 'ky', - 'm', 'my', 'n', 'ny', 'o', 'p', 'py', 'r', 'ry', 's', 'sh', 
't', 'ts', 'u', 'V', 'w', 'y', 'z'] +ja_symbols_legacy = ['I', 'N', 'U', 'a', 'b', 'by', 'ch', 'cl', 'd', 'dy', 'e', 'f', 'g', 'gy', 'h', 'hy', 'i', 'j', + 'k', 'ky', + 'm', 'my', 'n', 'ny', 'o', 'p', 'py', 'r', 'ry', 's', 'sh', 't', 'ts', 'u', 'V', 'w', 'y', 'z'] +ja_symbols = [ + "N", + "a", + "a:", + "b", + "by", + "ch", + "d", + "dy", + "e", + "e:", + "f", + "g", + "gy", + "h", + "hy", + "i", + "i:", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "o:", + "p", + "py", + "q", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "ty", + "u", + "u:", + "w", + "y", + "z", + "zy", +] num_ja_tones = 1 # English -en_symbols = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', - 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', - 'sh', 't', 'th', 'uh', 'uw', 'V', 'w', 'y', 'z', 'zh'] +en_symbols = [ + "aa", + "ae", + "ah", + "ao", + "aw", + "ay", + "b", + "ch", + "d", + "dh", + "eh", + "er", + "ey", + "f", + "g", + "hh", + "ih", + "iy", + "jh", + "k", + "l", + "m", + "n", + "ng", + "ow", + "oy", + "p", + "r", + "s", + "sh", + "t", + "th", + "uh", + "uw", + "V", + "w", + "y", + "z", + "zh", +] num_en_tones = 4 -# combine all symbols -normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) -symbols = [pad] + normal_symbols + pu_symbols -sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] - -# combine all tones -num_tones = num_zh_tones + num_ja_tones + num_en_tones - -# language maps -language_id_map = { - 'ZH': 0, - "JA": 1, - "EN": 2 -} -num_languages = len(language_id_map.keys()) - -language_tone_start_map = { - 'ZH': 0, - "JA": num_zh_tones, - "EN": num_zh_tones + num_ja_tones -} - -if __name__ == '__main__': - a = set(zh_symbols) - b = set(en_symbols) - print(sorted(a & b)) + +def get_symbols(legacy=False): + if legacy: + ja_symbols = ja_symbols_legacy + # combine all symbols + normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) + symbols = [pad] + normal_symbols + pu_symbols + sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] + + # combine all tones + num_tones = num_zh_tones + num_ja_tones + num_en_tones + + # language maps + language_id_map = {"ZH": 0, "JP": 1, "EN": 2} + num_languages = len(language_id_map.keys()) + + language_tone_start_map = { + "ZH": 0, + "JP": num_zh_tones, + "EN": num_zh_tones + num_ja_tones, + } + return symbols, num_tones, language_id_map, num_languages, language_tone_start_map + + +if __name__ == "__main__": + zh = set(zh_symbols) + en = set(en_symbols) + jp = set(ja_symbols) + print(zh) + print(en) + print(jp) + print(sorted(zh & en)) diff --git a/bert_vits2/text/tone_sandhi.py b/bert_vits2/text/tone_sandhi.py index c0a78a5..6a6e4c3 100644 --- a/bert_vits2/text/tone_sandhi.py +++ b/bert_vits2/text/tone_sandhi.py @@ -19,51 +19,442 @@ from pypinyin import Style -class ToneSandhi(): +class ToneSandhi: def __init__(self): self.must_neural_tone_words = { - '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', - '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊', - '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去', - '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号', - '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当', - '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻', - '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂', - '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆', - '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂', - 
'精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿', - '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台', - '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算', - '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨', - '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快', - '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜', - '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔', - '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事', - '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', - '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', - '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实', - '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', - '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', - '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', - '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', - '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', - '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', - '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', - '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', - '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', - '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', - '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', - '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', - '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', - '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', - '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', - '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', - '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', - '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', - '扫把', '惦记' + "麻烦", + "麻利", + "鸳鸯", + "高粱", + "骨头", + "骆驼", + "马虎", + "首饰", + "馒头", + "馄饨", + "风筝", + "难为", + "队伍", + "阔气", + "闺女", + "门道", + "锄头", + "铺盖", + "铃铛", + "铁匠", + "钥匙", + "里脊", + "里头", + "部分", + "那么", + "道士", + "造化", + "迷糊", + "连累", + "这么", + "这个", + "运气", + "过去", + "软和", + "转悠", + "踏实", + "跳蚤", + "跟头", + "趔趄", + "财主", + "豆腐", + "讲究", + "记性", + "记号", + "认识", + "规矩", + "见识", + "裁缝", + "补丁", + "衣裳", + "衣服", + "衙门", + "街坊", + "行李", + "行当", + "蛤蟆", + "蘑菇", + "薄荷", + "葫芦", + "葡萄", + "萝卜", + "荸荠", + "苗条", + "苗头", + "苍蝇", + "芝麻", + "舒服", + "舒坦", + "舌头", + "自在", + "膏药", + "脾气", + "脑袋", + "脊梁", + "能耐", + "胳膊", + "胭脂", + "胡萝", + "胡琴", + "胡同", + "聪明", + "耽误", + "耽搁", + "耷拉", + "耳朵", + "老爷", + "老实", + "老婆", + "老头", + "老太", + "翻腾", + "罗嗦", + "罐头", + "编辑", + "结实", + "红火", + "累赘", + "糨糊", + "糊涂", + "精神", + "粮食", + "簸箕", + "篱笆", + "算计", + "算盘", + "答应", + "笤帚", + "笑语", + "笑话", + "窟窿", + "窝囊", + "窗户", + "稳当", + "稀罕", + "称呼", + "秧歌", + "秀气", + "秀才", + "福气", + "祖宗", + "砚台", + "码头", + "石榴", + "石头", + "石匠", + "知识", + "眼睛", + "眯缝", + "眨巴", + "眉毛", + "相声", + "盘算", + "白净", + "痢疾", + "痛快", + "疟疾", + "疙瘩", + "疏忽", + "畜生", + "生意", + "甘蔗", + "琵琶", + "琢磨", + "琉璃", + "玻璃", + "玫瑰", + "玄乎", + "狐狸", + "状元", + "特务", + "牲口", + "牙碜", + "牌楼", + "爽快", + "爱人", + "热闹", + "烧饼", + "烟筒", + "烂糊", + "点心", + "炊帚", + "灯笼", + "火候", + "漂亮", + "滑溜", + "溜达", + "温和", + "清楚", + "消息", + "浪头", + "活泼", + "比方", + "正经", + "欺负", + "模糊", + "槟榔", + "棺材", + "棒槌", + "棉花", + "核桃", + "栅栏", + "柴火", + "架势", + "枕头", + "枇杷", + "机灵", + "本事", + "木头", + "木匠", + "朋友", + "月饼", + "月亮", + "暖和", + "明白", + "时候", + "新鲜", + 
"故事", + "收拾", + "收成", + "提防", + "挖苦", + "挑剔", + "指甲", + "指头", + "拾掇", + "拳头", + "拨弄", + "招牌", + "招呼", + "抬举", + "护士", + "折腾", + "扫帚", + "打量", + "打算", + "打点", + "打扮", + "打听", + "打发", + "扎实", + "扁担", + "戒指", + "懒得", + "意识", + "意思", + "情形", + "悟性", + "怪物", + "思量", + "怎么", + "念头", + "念叨", + "快活", + "忙活", + "志气", + "心思", + "得罪", + "张罗", + "弟兄", + "开通", + "应酬", + "庄稼", + "干事", + "帮手", + "帐篷", + "希罕", + "师父", + "师傅", + "巴结", + "巴掌", + "差事", + "工夫", + "岁数", + "屁股", + "尾巴", + "少爷", + "小气", + "小伙", + "将就", + "对头", + "对付", + "寡妇", + "家伙", + "客气", + "实在", + "官司", + "学问", + "学生", + "字号", + "嫁妆", + "媳妇", + "媒人", + "婆家", + "娘家", + "委屈", + "姑娘", + "姐夫", + "妯娌", + "妥当", + "妖精", + "奴才", + "女婿", + "头发", + "太阳", + "大爷", + "大方", + "大意", + "大夫", + "多少", + "多么", + "外甥", + "壮实", + "地道", + "地方", + "在乎", + "困难", + "嘴巴", + "嘱咐", + "嘟囔", + "嘀咕", + "喜欢", + "喇嘛", + "喇叭", + "商量", + "唾沫", + "哑巴", + "哈欠", + "哆嗦", + "咳嗽", + "和尚", + "告诉", + "告示", + "含糊", + "吓唬", + "后头", + "名字", + "名堂", + "合同", + "吆喝", + "叫唤", + "口袋", + "厚道", + "厉害", + "千斤", + "包袱", + "包涵", + "匀称", + "勤快", + "动静", + "动弹", + "功夫", + "力气", + "前头", + "刺猬", + "刺激", + "别扭", + "利落", + "利索", + "利害", + "分析", + "出息", + "凑合", + "凉快", + "冷战", + "冤枉", + "冒失", + "养活", + "关系", + "先生", + "兄弟", + "便宜", + "使唤", + "佩服", + "作坊", + "体面", + "位置", + "似的", + "伙计", + "休息", + "什么", + "人家", + "亲戚", + "亲家", + "交情", + "云彩", + "事情", + "买卖", + "主意", + "丫头", + "丧气", + "两口", + "东西", + "东家", + "世故", + "不由", + "不在", + "下水", + "下巴", + "上头", + "上司", + "丈夫", + "丈人", + "一辈", + "那个", + "菩萨", + "父亲", + "母亲", + "咕噜", + "邋遢", + "费用", + "冤家", + "甜头", + "介绍", + "荒唐", + "大人", + "泥鳅", + "幸福", + "熟悉", + "计划", + "扑腾", + "蜡烛", + "姥爷", + "照顾", + "喉咙", + "吉他", + "弄堂", + "蚂蚱", + "凤凰", + "拖沓", + "寒碜", + "糟蹋", + "倒腾", + "报复", + "逻辑", + "盘缠", + "喽啰", + "牢骚", + "咖喱", + "扫把", + "惦记", } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" + "男子", + "女子", + "分子", + "原子", + "量子", + "莲子", + "石子", + "瓜子", + "电子", + "人人", + "虎虎", } self.punc = ":,;。?!“”‘’':,;.?!" @@ -72,14 +463,15 @@ def __init__(self): # word: "家里" # pos: "s" # finals: ['ia1', 'i3'] - def _neural_sandhi(self, word: str, pos: str, - finals: List[str]) -> List[str]: - + def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[0] in { - "n", "v", "a" - } and word not in self.must_not_neural_tone_words: + if ( + j - 1 >= 0 + and item == word[j - 1] + and pos[0] in {"n", "v", "a"} + and word not in self.must_not_neural_tone_words + ): finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": @@ -89,9 +481,12 @@ def _neural_sandhi(self, word: str, pos: str, # e.g. 走了, 看着, 去过 # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: # finals[-1] = finals[-1][:-1] + "5" - elif len(word) > 1 and word[-1] in "们子" and pos in { - "r", "n" - } and word not in self.must_not_neural_tone_words: + elif ( + len(word) > 1 + and word[-1] in "们子" + and pos in {"r", "n"} + and word not in self.must_not_neural_tone_words + ): finals[-1] = finals[-1][:-1] + "5" # e.g. 
桌上, 地下, 家里 elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: @@ -100,21 +495,26 @@ def _neural_sandhi(self, word: str, pos: str, elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": finals[-1] = finals[-1][:-1] + "5" # 个做量词 - elif (ge_idx >= 1 and - (word[ge_idx - 1].isnumeric() or - word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个': + elif ( + ge_idx >= 1 + and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ) or word == "个": finals[ge_idx] = finals[ge_idx][:-1] + "5" else: - if word in self.must_neural_tone_words or word[ - -2:] in self.must_neural_tone_words: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): finals[-1] = finals[-1][:-1] + "5" word_list = self._split_word(word) - finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]] + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] for i, word in enumerate(word_list): # conventional neural in Chinese - if word in self.must_neural_tone_words or word[ - -2:] in self.must_neural_tone_words: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): finals_list[i][-1] = finals_list[i][-1][:-1] + "5" finals = sum(finals_list, []) return finals @@ -126,17 +526,17 @@ def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: else: for i, char in enumerate(word): # "不" before tone4 should be bu2, e.g. 不怕 - if char == "不" and i + 1 < len(word) and finals[i + - 1][-1] == "4": + if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": finals[i] = finals[i][:-1] + "2" return finals def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: # "一" in number sequences, e.g. 一零零, 二一零 if word.find("一") != -1 and all( - [item.isnumeric() for item in word if item != "一"]): + [item.isnumeric() for item in word if item != "一"] + ): return finals - # "一" between reduplication words shold be yi5, e.g. 看一看 + # "一" between reduplication words should be yi5, e.g. 看一看 elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: finals[1] = finals[1][:-1] + "5" # when "一" is ordinal word, it should be yi1 @@ -161,10 +561,10 @@ def _split_word(self, word: str) -> List[str]: first_subword = word_list[0] first_begin_idx = word.find(first_subword) if first_begin_idx == 0: - second_subword = word[len(first_subword):] + second_subword = word[len(first_subword) :] new_word_list = [first_subword, second_subword] else: - second_subword = word[:-len(first_subword)] + second_subword = word[: -len(first_subword)] new_word_list = [second_subword, first_subword] return new_word_list @@ -182,18 +582,19 @@ def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: elif len(word_list[0]) == 1: finals[1] = finals[1][:-1] + "2" else: - finals_list = [ - finals[:len(word_list[0])], finals[len(word_list[0]):] - ] + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] if len(finals_list) == 2: for i, sub in enumerate(finals_list): # e.g. 所有/人 if self._all_tone_three(sub) and len(sub) == 2: finals_list[i][0] = finals_list[i][0][:-1] + "2" # e.g. 
好/喜欢 - elif i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and \ - finals_list[0][-1][-1] == "3": - + elif ( + i == 1 + and not self._all_tone_three(sub) + and finals_list[i][0][-1] == "3" + and finals_list[0][-1][-1] == "3" + ): finals_list[0][-1] = finals_list[0][-1][:-1] + "2" finals = sum(finals_list, []) # split idiom into two words who's length is 2 @@ -222,7 +623,7 @@ def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg.append((word, pos)) last_word = word[:] if last_word == "不": - new_seg.append((last_word, 'd')) + new_seg.append((last_word, "d")) last_word = "" return new_seg @@ -236,12 +637,21 @@ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] # function 1 for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ - 0] == seg[i + 1][0] and seg[i - 1][1] == "v": + if ( + i - 1 >= 0 + and word == "一" + and i + 1 < len(seg) + and seg[i - 1][0] == seg[i + 1][0] + and seg[i - 1][1] == "v" + ): new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] else: - if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ - 0] == word and pos == "v": + if ( + i - 2 >= 0 + and seg[i - 1][0] == "一" + and seg[i - 2][0] == word + and pos == "v" + ): continue else: new_seg.append([word, pos]) @@ -257,22 +667,27 @@ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: # the first and the second words are all_tone_three def _merge_continuous_three_tones( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and self._all_tone_three( - sub_finals_list[i - 1]) and self._all_tone_three( - sub_finals_list[i]) and not merge_last[i - 1]: + if ( + i - 1 >= 0 + and self._all_tone_three(sub_finals_list[i - 1]) + and self._all_tone_three(sub_finals_list[i]) + and not merge_last[i - 1] + ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len( - seg[i - 1][0]) + len(seg[i][0]) <= 3: + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -287,21 +702,27 @@ def _is_reduplication(self, word: str) -> bool: # the last char of first word and the first char of second word is tone_three def _merge_continuous_three_tones_2( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \ - merge_last[i - 1]: + if ( + i - 1 >= 0 + and sub_finals_list[i - 1][-1][-1] == "3" + and sub_finals_list[i][0][-1] == "3" + and not merge_last[i - 1] + ): # if the last 
word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len( - seg[i - 1][0]) + len(seg[i][0]) <= 3: + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -319,8 +740,7 @@ def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg.append([word, pos]) return new_seg - def _merge_reduplication( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] for i, (word, pos) in enumerate(seg): if new_seg and word == new_seg[-1][0]: @@ -329,8 +749,7 @@ def _merge_reduplication( new_seg.append([word, pos]) return new_seg - def pre_merge_for_modify( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: seg = self._merge_bu(seg) try: seg = self._merge_yi(seg) @@ -342,8 +761,7 @@ def pre_merge_for_modify( seg = self._merge_er(seg) return seg - def modified_tone(self, word: str, pos: str, - finals: List[str]) -> List[str]: + def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/config.py b/config.py index 252040a..5ef6a73 100644 --- a/config.py +++ b/config.py @@ -1,6 +1,8 @@ import os import sys +import torch + JSON_AS_ASCII = False MAX_CONTENT_LENGTH = 5242880 @@ -79,6 +81,8 @@ # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path. # DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml" +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + """ Default parameter """ diff --git a/requirements.txt b/requirements.txt index b425cfc..efbbe65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,5 @@ fastlid langid phonemizer==3.2.1 transformers -pydantic==2.3.0 \ No newline at end of file +pydantic==2.3.0 +num2words \ No newline at end of file diff --git a/utils/merge.py b/utils/load_model.py similarity index 100% rename from utils/merge.py rename to utils/load_model.py diff --git a/utils/nlp.py b/utils/sentence.py similarity index 100% rename from utils/nlp.py rename to utils/sentence.py diff --git a/vits/vits.py b/vits/vits.py index 21c06c3..cc7cd3d 100644 --- a/vits/vits.py +++ b/vits/vits.py @@ -4,7 +4,7 @@ import numpy as np import torch from torch import no_grad, LongTensor, inference_mode, FloatTensor -from utils.nlp import sentence_split +from utils.sentence import sentence_split from vits.mel_processing import spectrogram_torch from vits.text import text_to_sequence from vits.models import SynthesizerTrn
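
Usage sketch (not part of the patch), assuming only the signatures visible in the hunks above; the sample phoneme/tone lists are hypothetical. get_symbols() now returns per-model symbol/tone/language tables instead of module-level globals (note that in the hunk above ja_symbols appears to be rebound only inside the legacy branch, so this sketch passes legacy=True), and cleaned_text_to_sequence() receives those tables explicitly, matching how Bert_VITS2.get_text() calls it:

# Minimal sketch: exercising the refactored symbol plumbing from this diff.
from bert_vits2.text.symbols import get_symbols
from bert_vits2.text import cleaned_text_to_sequence

# legacy=True selects ja_symbols_legacy (models trained before the Japanese g2p
# rewrite); Bert_VITS2.__init__ reads this flag from hps.data.legacy.
symbols, num_tones, language_id_map, num_languages, language_tone_start_map = get_symbols(legacy=True)
symbol_to_id = {s: i for i, s in enumerate(symbols)}

# Hypothetical cleaned Chinese input (the phone/tone lists clean_text(text, "ZH")
# would produce); every phoneme used here is present in zh_symbols.
phones = ["_", "n", "i", "h", "ao", "_"]
tones = [0, 3, 3, 3, 3, 0]

phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(
    phones, tones, "ZH", symbol_to_id, language_tone_start_map, language_id_map
)
print(phone_ids, tone_ids, lang_ids)

The same tables are what get forwarded into SynthesizerTrn via the new symbols / num_tones / num_languages kwargs, so a legacy checkpoint and a current one can coexist in one process, each with its own id mapping.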