diff --git a/TTSManager.py b/TTSManager.py index d482646..658a60e 100644 --- a/TTSManager.py +++ b/TTSManager.py @@ -244,6 +244,18 @@ def vits_infer(self, state, encode=True): audio = np.concatenate(audios, axis=0) return self.encode(sampling_rate, audio, state["format"]) if encode else audio + def stream_vits_infer(self, task, fname=None): + format = task.get("format", "wav") + voice_obj = self._voice_obj[ModelType.VITS][task.get("id")][1] + task["id"] = self._voice_obj[ModelType.VITS][task.get("id")][0] + sampling_rate = voice_obj.sampling_rate + genertator = voice_obj.get_stream_audio(task, auto_break=True) + # audio = BytesIO() + for chunk in genertator: + encoded_audio = self.encode(sampling_rate, chunk, format) + for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio): + yield encoded_audio_chunk + def stream_vits_infer(self, state, fname=None): model = self.get_model(ModelType.VITS, state["id"]) state["id"] = self.get_real_id(ModelType.VITS, state["id"]) @@ -266,11 +278,11 @@ def stream_vits_infer(self, state, fname=None): if i < sentences_num - 1: audios.append(brk) - audio = np.concatenate(audios, axis=0) - encoded_audio = self.encode(sampling_rate, audio, state["format"]) + audio = np.concatenate(audios, axis=0) + encoded_audio = self.encode(sampling_rate, audio, state["format"]) - for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio): - yield encoded_audio_chunk + for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio): + yield encoded_audio_chunk # if getattr(config, "SAVE_AUDIO", False): # audio.write(encoded_audio.getvalue()) # if getattr(config, "SAVE_AUDIO", False): @@ -357,3 +369,28 @@ def bert_vits2_infer(self, state, encode=True): audio = np.concatenate(audios) return self.encode(sampling_rate, audio, state["format"]) if encode else audio + + def stream_bert_vits2_infer(self, state, fname=None): + model = self.get_model(ModelType.BERT_VITS2, state["id"]) + state["id"] = self.get_real_id(ModelType.BERT_VITS2, state["id"]) + + # 去除所有多余的空白字符 + if state["text"] is not None: + state["text"] = re.sub(r'\s+', ' ', state["text"]).strip() + sampling_rate = model.sampling_rate + + sentences_list = split_by_language(state["text"], state["speaker_lang"]) + + # audios = [] + + for (text, lang) in sentences_list: + sentences = sentence_split(text, state["max"]) + for sentence in sentences: + audio = model.infer(sentence, state["id"], lang, state["sdp_ratio"], state["noise"], + state["noise"], state["length"]) + # audios.append(audio) + # audio = np.concatenate(audios, axis=0) + encoded_audio = self.encode(sampling_rate, audio, state["format"]) + + for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio): + yield encoded_audio_chunk diff --git a/tts_app/templates/pages/index.html b/tts_app/templates/pages/index.html index 6722d5e..8138d4a 100644 --- a/tts_app/templates/pages/index.html +++ b/tts_app/templates/pages/index.html @@ -295,6 +295,11 @@