diff --git a/crates/voicevox_core/src/__internal.rs b/crates/voicevox_core/src/__internal.rs
index ff9f5ce3c..a23dcf086 100644
--- a/crates/voicevox_core/src/__internal.rs
+++ b/crates/voicevox_core/src/__internal.rs
@@ -1,4 +1,5 @@
 pub mod doctest_fixtures;
+pub mod interop;
 
 // Implementations for the wrappers inside VOICEVOX CORE
 // FIXME: needs discussion: https://github.com/VOICEVOX/voicevox_core/issues/595
diff --git a/crates/voicevox_core/src/__internal/interop.rs b/crates/voicevox_core/src/__internal/interop.rs
new file mode 100644
index 000000000..d76df9d88
--- /dev/null
+++ b/crates/voicevox_core/src/__internal/interop.rs
@@ -0,0 +1 @@
+pub use crate::synthesizer::PerformInference;
diff --git a/crates/voicevox_core/src/engine/open_jtalk.rs b/crates/voicevox_core/src/engine/open_jtalk.rs
index fa2e91304..e45f9a5fa 100644
--- a/crates/voicevox_core/src/engine/open_jtalk.rs
+++ b/crates/voicevox_core/src/engine/open_jtalk.rs
@@ -52,7 +52,7 @@ impl OpenJtalk {
     pub async fn new(open_jtalk_dict_dir: impl AsRef<Path>) -> crate::result::Result<Self> {
         let open_jtalk_dict_dir = open_jtalk_dict_dir.as_ref().to_owned();
 
-        tokio::task::spawn_blocking(move || {
+        crate::task::asyncify(move || {
             let mut s = Self::new_without_dic();
             s.load(open_jtalk_dict_dir).map_err(|()| {
                 // FIXME: provide a proper error for "tried to read the system dictionary but failed"
@@ -61,7 +61,6 @@ impl OpenJtalk {
             Ok(s)
         })
         .await
-        .unwrap()
     }
 
     // `load` must be called beforehand.
@@ -80,7 +79,7 @@ impl OpenJtalk {
 
         let words = user_dict.to_mecab_format();
 
-        let result = tokio::task::spawn_blocking(move || -> crate::Result<_> {
+        let result = crate::task::asyncify(move || -> crate::Result<_> {
            // Build the CSV for the user dictionary
            let mut temp_csv =
                NamedTempFile::new().map_err(|e| ErrorRepr::UseUserDict(e.into()))?;
@@ -111,8 +110,7 @@ impl OpenJtalk {
             Ok(mecab.load_with_userdic(dict_dir.as_ref(), Some(Path::new(&temp_dict_path))))
         })
-        .await
-        .unwrap()?;
+        .await?;
 
         if !result {
             return Err(ErrorRepr::UseUserDict(anyhow!("辞書のコンパイルに失敗しました")).into());
diff --git a/crates/voicevox_core/src/infer/status.rs b/crates/voicevox_core/src/infer/status.rs
index 7903cb8ff..62805d37a 100644
--- a/crates/voicevox_core/src/infer/status.rs
+++ b/crates/voicevox_core/src/infer/status.rs
@@ -90,10 +90,16 @@ impl<R: InferenceRuntime, D: InferenceDomain> Status<R, D> {
         self.is_loaded_model_by_style_id(style_id)
     }
 
+    /// Performs inference.
+    ///
+    /// # Performance
+    ///
+    /// This is a CPU/GPU-bound operation, so it should not be run directly on an async runtime.
+    ///
     /// # Panics
     ///
     /// Panics if `self` does not contain `model_id`.
-    pub(crate) async fn run_session<I>(
+    pub(crate) fn run_session<I>(
         &self,
         model_id: &VoiceModelId,
         input: I,
@@ -103,10 +109,7 @@ impl<R: InferenceRuntime, D: InferenceDomain> Status<R, D> {
         I::Signature: InferenceSignature<Domain = D>,
     {
         let sess = self.loaded_models.lock().unwrap().get(model_id);
-
-        tokio::task::spawn_blocking(move || sess.run(input))
-            .await
-            .unwrap()
+        sess.run(input)
     }
 }
diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs
index 5bed99fdd..78552a9f8 100644
--- a/crates/voicevox_core/src/lib.rs
+++ b/crates/voicevox_core/src/lib.rs
@@ -11,11 +11,11 @@ mod metas;
 mod numerics;
 mod result;
 mod synthesizer;
+mod task;
 mod user_dict;
 mod version;
 mod voice_model;
 
-#[doc(hidden)]
 pub mod __internal;
 
 #[cfg(test)]
@@ -31,7 +31,9 @@ pub use self::result::*;
 pub use self::voice_model::*;
 pub use devices::*;
 pub use manifest::*;
-pub use synthesizer::*;
+pub use synthesizer::{
+    AccelerationMode, InitializeOptions, SynthesisOptions, Synthesizer, TtsOptions,
+};
 pub use user_dict::*;
 pub use version::*;
diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
index bb4afb153..383e8f6c0 100644
--- a/crates/voicevox_core/src/synthesizer.rs
+++ b/crates/voicevox_core/src/synthesizer.rs
@@ -3,6 +3,7 @@ use std::{
     sync::Arc,
 };
 
+use easy_ext::ext;
 use enum_map::enum_map;
 
 use crate::{
@@ -11,9 +12,8 @@ use crate::{
     },
     infer::{
         domain::{
-            DecodeInput, DecodeOutput, InferenceDomainImpl, InferenceOperationImpl,
-            PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
-            PredictIntonationOutput,
+            DecodeInput, DecodeOutput, InferenceOperationImpl, PredictDurationInput,
+            PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
         },
         runtimes::Onnxruntime,
         status::Status,
@@ -27,6 +27,7 @@ use super::*;
 /// Options for [`Synthesizer::synthesis`].
 ///
 /// [`Synthesizer::synthesis`]: Synthesizer::synthesis
+#[derive(Clone)]
 pub struct SynthesisOptions {
     pub enable_interrogative_upspeak: bool,
 }
@@ -48,6 +49,7 @@ impl From<&TtsOptions> for SynthesisOptions {
 /// Options for [`Synthesizer::tts`].
 ///
 /// [`Synthesizer::tts`]: Synthesizer::tts
+#[derive(Clone)]
 pub struct TtsOptions {
     pub enable_interrogative_upspeak: bool,
 }
@@ -92,13 +94,160 @@ const DEFAULT_SAMPLING_RATE: u32 = 24000;
 
 pub(crate) type InferenceRuntimeImpl = Onnxruntime;
 
 /// The voice synthesizer.
-pub struct Synthesizer {
-    status: Status<InferenceRuntimeImpl, InferenceDomainImpl>,
-    open_jtalk: Arc<OpenJtalk>,
-    use_gpu: bool,
-}
+#[derive(Clone)]
+pub struct Synthesizer(Arc<blocking::Synthesizer>);
 
+// FIXME: write docs
 impl Synthesizer {
+    pub fn new(open_jtalk: Arc<OpenJtalk>, options: &InitializeOptions) -> Result<Self> {
+        blocking::Synthesizer::new(open_jtalk, options)
+            .map(Into::into)
+            .map(Self)
+    }
+
+    pub fn is_gpu_mode(&self) -> bool {
+        self.0.is_gpu_mode()
+    }
+
+    pub async fn load_voice_model(&self, model: &VoiceModel) -> Result<()> {
+        self.0.load_voice_model(model).await
+    }
+
+    pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> {
+        self.0.unload_voice_model(voice_model_id)
+    }
+
+    pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool {
+        self.0.is_loaded_voice_model(voice_model_id)
+    }
+
+    #[doc(hidden)]
+    pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool {
+        self.0.is_loaded_model_by_style_id(style_id)
+    }
+
+    pub fn metas(&self) -> VoiceModelMeta {
+        self.0.metas()
+    }
+
+    pub async fn synthesis(
+        &self,
+        audio_query: &AudioQueryModel,
+        style_id: StyleId,
+        options: &SynthesisOptions,
+    ) -> Result<Vec<u8>> {
+        let blocking = self.0.clone();
+        let audio_query = audio_query.clone();
+        let options = options.clone();
+
+        crate::task::asyncify(move || blocking.synthesis(&audio_query, style_id, &options)).await
+    }
+
+    pub async fn create_accent_phrases_from_kana(
+        &self,
+        kana: &str,
+        style_id: StyleId,
+    ) -> Result<Vec<AccentPhraseModel>> {
+        let blocking = self.0.clone();
+        let kana = kana.to_owned();
+
+        crate::task::asyncify(move || blocking.create_accent_phrases_from_kana(&kana, style_id))
+            .await
+    }
+
+    pub async fn create_accent_phrases(
+        &self,
+        text: &str,
+        style_id: StyleId,
+    ) -> Result<Vec<AccentPhraseModel>> {
+        let blocking = self.0.clone();
+        let text = text.to_owned();
+
+        crate::task::asyncify(move || blocking.create_accent_phrases(&text, style_id)).await
+    }
+
+    pub async fn replace_mora_data(
+        &self,
+        accent_phrases: &[AccentPhraseModel],
+        style_id: StyleId,
+    ) -> Result<Vec<AccentPhraseModel>> {
+        let blocking = self.0.clone();
+        let accent_phrases = accent_phrases.to_owned();
+
+        crate::task::asyncify(move || blocking.replace_mora_data(&accent_phrases, style_id)).await
+    }
+
+    pub async fn replace_phoneme_length(
+        &self,
+        accent_phrases: &[AccentPhraseModel],
+        style_id: StyleId,
+    ) -> Result<Vec<AccentPhraseModel>> {
+        let blocking = self.0.clone();
+        let accent_phrases = accent_phrases.to_owned();
+
+        crate::task::asyncify(move || blocking.replace_phoneme_length(&accent_phrases, style_id))
+            .await
+    }
+
+    pub async fn replace_mora_pitch(
+        &self,
+        accent_phrases: &[AccentPhraseModel],
+        style_id: StyleId,
+    ) -> Result<Vec<AccentPhraseModel>> {
+        let blocking = self.0.clone();
+        let accent_phrases = accent_phrases.to_owned();
+
+        crate::task::asyncify(move || blocking.replace_mora_pitch(&accent_phrases, style_id)).await
+    }
+
+    pub async fn audio_query_from_kana(
+        &self,
+        kana: &str,
+        style_id: StyleId,
+    ) -> Result<AudioQueryModel> {
+        let blocking = self.0.clone();
+        let kana = kana.to_owned();
+
+        crate::task::asyncify(move || blocking.audio_query_from_kana(&kana, style_id)).await
+    }
+
+    pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result<AudioQueryModel> {
+        let blocking = self.0.clone();
+        let text = text.to_owned();
+
+        crate::task::asyncify(move || blocking.audio_query(&text, style_id)).await
+    }
+
+    pub async fn tts_from_kana(
+        &self,
+        kana: &str,
+        style_id: StyleId,
+        options: &TtsOptions,
+    ) -> Result<Vec<u8>> {
+        let blocking = self.0.clone();
+        let kana = kana.to_owned();
+        let options = options.clone();
+
+        crate::task::asyncify(move || blocking.tts_from_kana(&kana, style_id, &options)).await
+    }
+
+    pub async fn tts(
+        &self,
+        text: &str,
+        style_id: StyleId,
+        options: &TtsOptions,
+    ) -> Result<Vec<u8>> {
+        let blocking = self.0.clone();
+        let text = text.to_owned();
+        let options = options.clone();
+
+        crate::task::asyncify(move || blocking.tts(&text, style_id, &options)).await
+    }
+}
+
+// FIXME: The doc code blocks here are for the async version, so move them up to the impl above;
+// if the blocking version ever becomes public API, it should get blocking code examples of its own.
+impl blocking::Synthesizer {
     /// Constructs a `Synthesizer`.
     ///
     /// # Example
@@ -126,7 +275,7 @@ impl Synthesizer {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn new(open_jtalk: Arc<OpenJtalk>, options: &InitializeOptions) -> Result<Self> {
+    fn new(open_jtalk: Arc<OpenJtalk>, options: &InitializeOptions) -> Result<Self> {
         #[cfg(windows)]
         list_windows_video_cards();
 
@@ -183,38 +332,38 @@ impl Synthesizer {
     }
 
     /// Returns whether hardware acceleration is in GPU mode.
-    pub fn is_gpu_mode(&self) -> bool {
+    fn is_gpu_mode(&self) -> bool {
         self.use_gpu
     }
 
+    // FIXME: make a blocking version
    /// Loads a voice model.
-    pub async fn load_voice_model(&self, model: &VoiceModel) -> Result<()> {
+    async fn load_voice_model(&self, model: &VoiceModel) -> Result<()> {
         let model_bytes = &model.read_inference_models().await?;
         self.status.load_model(model, model_bytes).await
     }
 
     /// Unloads a voice model.
-    pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> {
+    fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> {
         self.status.unload_model(voice_model_id)
     }
 
     /// Returns whether the voice model with the given ID is loaded.
-    pub fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool {
+    fn is_loaded_voice_model(&self, voice_model_id: &VoiceModelId) -> bool {
         self.status.is_loaded_model(voice_model_id)
     }
 
-    #[doc(hidden)]
-    pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool {
+    fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool {
         self.status.is_loaded_model_by_style_id(style_id)
     }
 
     /// Returns the metadata of the voice models currently loaded.
-    pub fn metas(&self) -> VoiceModelMeta {
+    fn metas(&self) -> VoiceModelMeta {
         self.status.metas()
     }
 
     /// Performs speech synthesis from an AudioQuery.
-    pub async fn synthesis(
+    fn synthesis(
         &self,
         audio_query: &AudioQueryModel,
         style_id: StyleId,
@@ -315,15 +464,13 @@ impl Synthesizer {
         // Flatten the two-dimensional vector into one dimension so the addresses are contiguous
         let flatten_phoneme = phoneme.into_iter().flatten().collect::<Vec<f32>>();
 
-        let wave = &self
-            .decode(
-                f0.len(),
-                OjtPhoneme::num_phoneme(),
-                &f0,
-                &flatten_phoneme,
-                style_id,
-            )
-            .await?;
+        let wave = &self.decode(
+            f0.len(),
+            OjtPhoneme::num_phoneme(),
+            &f0,
+            &flatten_phoneme,
+            style_id,
+        )?;
         return Ok(to_wav(wave, audio_query));
 
         fn adjust_interrogative_accent_phrases(
@@ -445,12 +592,12 @@ impl Synthesizer {
     /// # Ok(())
     /// # }
     /// ```
-    pub async fn create_accent_phrases_from_kana(
+    fn create_accent_phrases_from_kana(
         &self,
         kana: &str,
         style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
-        self.replace_mora_data(&parse_kana(kana)?, style_id).await
+        self.replace_mora_data(&parse_kana(kana)?, style_id)
     }
 
     /// Generates an array of AccentPhrase (accent phrases) from Japanese text.
@@ -476,7 +623,7 @@ impl Synthesizer {
     /// # Ok(())
     /// # }
     /// ```
-    pub async fn create_accent_phrases(
+    fn create_accent_phrases(
         &self,
         text: &str,
         style_id: StyleId,
@@ -554,23 +701,21 @@ impl Synthesizer {
                 accum_vec
             });
 
-        self.replace_mora_data(&accent_phrases, style_id).await
+        self.replace_mora_data(&accent_phrases, style_id)
     }
 
     /// Regenerates the pitch and phoneme lengths of an array of AccentPhrase with a specific voice.
-    pub async fn replace_mora_data(
+    fn replace_mora_data(
         &self,
         accent_phrases: &[AccentPhraseModel],
         style_id: StyleId,
     ) -> Result<Vec<AccentPhraseModel>> {
-        let accent_phrases = self
-            .replace_phoneme_length(accent_phrases, style_id)
-            .await?;
-        self.replace_mora_pitch(&accent_phrases, style_id).await
+        let accent_phrases = self.replace_phoneme_length(accent_phrases, style_id)?;
+        self.replace_mora_pitch(&accent_phrases, style_id)
     }
 
     /// Regenerates the phoneme lengths of an array of AccentPhrase with a specific voice.
-    pub async fn replace_phoneme_length(
+    fn replace_phoneme_length(
         &self,
         accent_phrases: &[AccentPhraseModel],
         style_id: StyleId,
@@ -583,7 +728,7 @@ impl Synthesizer {
             .iter()
             .map(|phoneme_data| phoneme_data.phoneme_id())
             .collect();
-        let phoneme_length = self.predict_duration(&phoneme_list_s, style_id).await?;
+        let phoneme_length = self.predict_duration(&phoneme_list_s, style_id)?;
 
         let mut index = 0;
         let new_accent_phrases = accent_phrases
@@ -630,7 +775,7 @@ impl Synthesizer {
     }
 
     /// Regenerates the pitch of an array of AccentPhrase with a specific voice.
-    pub async fn replace_mora_pitch(
+    fn replace_mora_pitch(
         &self,
         accent_phrases: &[AccentPhraseModel],
         style_id: StyleId,
@@ -679,18 +824,16 @@ impl Synthesizer {
             end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]);
         }
 
-        let mut f0_list = self
-            .predict_intonation(
-                vowel_phoneme_list.len(),
-                &vowel_phoneme_list,
-                &consonant_phoneme_list,
-                &start_accent_list,
-                &end_accent_list,
-                &start_accent_phrase_list,
-                &end_accent_phrase_list,
-                style_id,
-            )
-            .await?;
+        let mut f0_list = self.predict_intonation(
+            vowel_phoneme_list.len(),
+            &vowel_phoneme_list,
+            &consonant_phoneme_list,
+            &start_accent_list,
+            &end_accent_list,
+            &start_accent_phrase_list,
+            &end_accent_phrase_list,
+            style_id,
+        )?;
 
         for i in 0..vowel_phoneme_data_list.len() {
             const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"];
@@ -792,12 +935,8 @@ impl Synthesizer {
     /// ```
     ///
     /// [AudioQuery]: crate::AudioQueryModel
-    pub async fn audio_query_from_kana(
-        &self,
-        kana: &str,
-        style_id: StyleId,
-    ) -> Result<AudioQueryModel> {
-        let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?;
+    fn audio_query_from_kana(&self, kana: &str, style_id: StyleId) -> Result<AudioQueryModel> {
+        let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id)?;
         Ok(AudioQueryModel::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned())))
     }
 
@@ -826,45 +965,78 @@ impl Synthesizer {
     /// ```
     ///
     /// [AudioQuery]: crate::AudioQueryModel
-    pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result<AudioQueryModel> {
-        let accent_phrases = self.create_accent_phrases(text, style_id).await?;
+    fn audio_query(&self, text: &str, style_id: StyleId) -> Result<AudioQueryModel> {
+        let accent_phrases = self.create_accent_phrases(text, style_id)?;
         Ok(AudioQueryModel::from_accent_phrases(accent_phrases))
     }
 
     /// Performs speech synthesis from AquesTalk-style notation.
-    pub async fn tts_from_kana(
+    fn tts_from_kana(
         &self,
         kana: &str,
         style_id: StyleId,
         options: &TtsOptions,
     ) -> Result<Vec<u8>> {
-        let audio_query = &self.audio_query_from_kana(kana, style_id).await?;
+        let audio_query = &self.audio_query_from_kana(kana, style_id)?;
         self.synthesis(audio_query, style_id, &SynthesisOptions::from(options))
-            .await
     }
 
     /// Performs speech synthesis from Japanese text.
-    pub async fn tts(
-        &self,
-        text: &str,
-        style_id: StyleId,
-        options: &TtsOptions,
-    ) -> Result<Vec<u8>> {
-        let audio_query = &self.audio_query(text, style_id).await?;
+    fn tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result<Vec<u8>> {
+        let audio_query = &self.audio_query(text, style_id)?;
         self.synthesis(audio_query, style_id, &SynthesisOptions::from(options))
-            .await
     }
 }
 
-// FIXME: strip the `async`, then carve this out into a (sealed) `pub trait PerformInference`
-// and expose it from `crate::__internal`
-#[doc(hidden)]
-impl Synthesizer {
-    pub async fn predict_duration(
+impl PerformInference for Synthesizer {
+    fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result<Vec<f32>> {
+        self.0.predict_duration(phoneme_vector, style_id)
+    }
+
+    fn predict_intonation(
         &self,
-        phoneme_vector: &[i64],
+        length: usize,
+        vowel_phoneme_vector: &[i64],
+        consonant_phoneme_vector: &[i64],
+        start_accent_vector: &[i64],
+        end_accent_vector: &[i64],
+        start_accent_phrase_vector: &[i64],
+        end_accent_phrase_vector: &[i64],
         style_id: StyleId,
     ) -> Result<Vec<f32>> {
+        self.0.predict_intonation(
+            length,
+            vowel_phoneme_vector,
+            consonant_phoneme_vector,
+            start_accent_vector,
+            end_accent_vector,
+            start_accent_phrase_vector,
+            end_accent_phrase_vector,
+            style_id,
+        )
+    }
+
+    fn decode(
+        &self,
+        length: usize,
+        phoneme_size: usize,
+        f0: &[f32],
+        phoneme_vector: &[f32],
+        style_id: StyleId,
+    ) -> Result<Vec<f32>> {
+        self.0
+            .decode(length, phoneme_size, f0, phoneme_vector, style_id)
+    }
+}
+
+#[ext(PerformInference)]
+impl blocking::Synthesizer {
+    /// Runs `predict_duration`.
+    ///
+    /// # Performance
+    ///
+    /// This is a CPU-bound operation, so it should not be run directly on an async runtime.
+    pub fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result<Vec<f32>> {
         // FIXME: this should be unnecessary, since `Status::ids_for` exists
         if !self.status.validate_speaker_id(style_id) {
             return Err(ErrorRepr::StyleNotFound { style_id }.into());
@@ -874,16 +1046,13 @@ impl Synthesizer {
 
         let PredictDurationOutput {
             phoneme_length: output,
-        } = self
-            .status
-            .run_session(
-                &model_id,
-                PredictDurationInput {
-                    phoneme_list: ndarray::arr1(phoneme_vector),
-                    speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
-                },
-            )
-            .await?;
+        } = self.status.run_session(
+            &model_id,
+            PredictDurationInput {
+                phoneme_list: ndarray::arr1(phoneme_vector),
+                speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
+            },
+        )?;
 
         let mut output = output.into_raw_vec();
         for output_item in output.iter_mut() {
@@ -897,8 +1066,13 @@ impl Synthesizer {
         const PHONEME_LENGTH_MINIMAL: f32 = 0.01;
     }
 
+    /// Runs `predict_intonation`.
+    ///
+    /// # Performance
+    ///
+    /// This is a CPU-bound operation, so it should not be run directly on an async runtime.
     #[allow(clippy::too_many_arguments)]
-    pub async fn predict_intonation(
+    pub fn predict_intonation(
         &self,
         length: usize,
         vowel_phoneme_vector: &[i64],
@@ -916,27 +1090,29 @@ impl Synthesizer {
 
         let (model_id, model_inner_id) = self.status.ids_for(style_id)?;
 
-        let PredictIntonationOutput { f0_list: output } = self
-            .status
-            .run_session(
-                &model_id,
-                PredictIntonationInput {
-                    length: ndarray::arr0(length as i64),
-                    vowel_phoneme_list: ndarray::arr1(vowel_phoneme_vector),
-                    consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector),
-                    start_accent_list: ndarray::arr1(start_accent_vector),
-                    end_accent_list: ndarray::arr1(end_accent_vector),
-                    start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector),
-                    end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector),
-                    speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
-                },
-            )
-            .await?;
+        let PredictIntonationOutput { f0_list: output } = self.status.run_session(
+            &model_id,
+            PredictIntonationInput {
+                length: ndarray::arr0(length as i64),
+                vowel_phoneme_list: ndarray::arr1(vowel_phoneme_vector),
+                consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector),
+                start_accent_list: ndarray::arr1(start_accent_vector),
+                end_accent_list: ndarray::arr1(end_accent_vector),
+                start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector),
+                end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector),
+                speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
+            },
+        )?;
 
         Ok(output.into_raw_vec())
     }
 
-    pub async fn decode(
+    /// Runs `decode`.
+    ///
+    /// # Performance
+    ///
+    /// This is a CPU/GPU-bound operation, so it should not be run directly on an async runtime.
+    pub fn decode(
         &self,
         length: usize,
         phoneme_size: usize,
@@ -966,21 +1142,18 @@ impl Synthesizer {
             padding_size,
         );
 
-        let DecodeOutput { wave: output } = self
-            .status
-            .run_session(
-                &model_id,
-                DecodeInput {
-                    f0: ndarray::arr1(&f0_with_padding)
-                        .into_shape([length_with_padding, 1])
-                        .unwrap(),
-                    phoneme: ndarray::arr1(&phoneme_with_padding)
-                        .into_shape([length_with_padding, phoneme_size])
-                        .unwrap(),
-                    speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
-                },
-            )
-            .await?;
+        let DecodeOutput { wave: output } = self.status.run_session(
+            &model_id,
+            DecodeInput {
+                f0: ndarray::arr1(&f0_with_padding)
+                    .into_shape([length_with_padding, 1])
+                    .unwrap(),
+                phoneme: ndarray::arr1(&phoneme_with_padding)
+                    .into_shape([length_with_padding, phoneme_size])
+                    .unwrap(),
+                speaker_id: ndarray::arr1(&[model_inner_id.raw_id().into()]),
+            },
+        )?;
 
         return Ok(trim_padding_from_output(
             output.into_raw_vec(),
@@ -1186,6 +1359,23 @@ impl AudioQueryModel {
     }
 }
 
+mod blocking {
+    use std::sync::Arc;
+
+    use crate::{
+        engine::OpenJtalk,
+        infer::{domain::InferenceDomainImpl, status::Status},
+    };
+
+    use super::InferenceRuntimeImpl;
+
+    pub(super) struct Synthesizer {
+        pub(super) status: Status<InferenceRuntimeImpl, InferenceDomainImpl>,
+        pub(super) open_jtalk: Arc<OpenJtalk>,
+        pub(super) use_gpu: bool,
+    }
+}
+
 #[cfg(test)]
 mod tests {
@@ -1284,9 +1474,7 @@ mod tests {
             30, 35, 14, 23, 7, 21, 14, 43, 30, 30, 23, 30, 35, 30, 0,
         ];
 
-        let result = syntesizer
-            .predict_duration(&phoneme_vector, StyleId::new(1))
-            .await;
+        let result = syntesizer.predict_duration(&phoneme_vector, StyleId::new(1));
 
         assert!(result.is_ok(), "{result:?}");
         assert_eq!(result.unwrap().len(), phoneme_vector.len());
@@ -1316,18 +1504,16 @@ mod tests {
         let start_accent_phrase_vector = [0, 1, 0, 0, 0];
         let end_accent_phrase_vector = [0, 0, 0, 1, 0];
 
-        let result = syntesizer
-            .predict_intonation(
-                vowel_phoneme_vector.len(),
-                &vowel_phoneme_vector,
-                &consonant_phoneme_vector,
-                &start_accent_vector,
-                &end_accent_vector,
-                &start_accent_phrase_vector,
-                &end_accent_phrase_vector,
-                StyleId::new(1),
-            )
-            .await;
+        let result = syntesizer.predict_intonation(
+            vowel_phoneme_vector.len(),
+            &vowel_phoneme_vector,
+            &consonant_phoneme_vector,
+            &start_accent_vector,
+            &end_accent_vector,
+            &start_accent_phrase_vector,
+            &end_accent_phrase_vector,
+            StyleId::new(1),
+        );
 
         assert!(result.is_ok(), "{result:?}");
         assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len());
@@ -1371,9 +1557,7 @@ mod tests {
         set_one(30, 45..60);
         set_one(0, 60..69);
 
-        let result = syntesizer
-            .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, StyleId::new(1))
-            .await;
+        let result = syntesizer.decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, StyleId::new(1));
 
         assert!(result.is_ok(), "{result:?}");
         assert_eq!(result.unwrap().len(), F0_LENGTH * 256);
diff --git a/crates/voicevox_core/src/task.rs b/crates/voicevox_core/src/task.rs
new file mode 100644
index 000000000..951e3c19e
--- /dev/null
+++ b/crates/voicevox_core/src/task.rs
@@ -0,0 +1,16 @@
+use std::panic;
+
+/// Turns a blocking operation into an async one.
+///
+/// # Panics
+///
+/// - If `f` panics, the panic is unwound as-is.
+/// - Panics if tokio's runtime "cancel"s the execution of `f` for its own reasons.
+pub(crate) async fn asyncify<F: FnOnce() -> R + Send + 'static, R: Send + 'static>(f: F) -> R {
+    tokio::task::spawn_blocking(f)
+        .await
+        .unwrap_or_else(|err| match err.try_into_panic() {
+            Ok(panic) => panic::resume_unwind(panic),
+            Err(err) => panic!("{err}"), // FIXME: propagate this as an error instead
+        })
+}
diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs
index dd8ce8e94..8f2d7c5cc 100644
--- a/crates/voicevox_core_c_api/src/compatible_engine.rs
+++ b/crates/voicevox_core_c_api/src/compatible_engine.rs
@@ -3,7 +3,7 @@ use std::{collections::BTreeMap, sync::Arc};
 use super::*;
 use libc::c_int;
 
-use voicevox_core::{OpenJtalk, StyleId, VoiceModel};
+use voicevox_core::{OpenJtalk, StyleId, VoiceModel, __internal::interop::PerformInference as _};
 
 macro_rules! ensure_initialized {
     ($synthesizer:expr $(,)?) => {
@@ -104,6 +104,8 @@ fn set_message(message: &str) {
 
 #[no_mangle]
 pub extern "C" fn initialize(use_gpu: bool, cpu_num_threads: c_int, load_all_models: bool) -> bool {
+    // FIXME: Wrapping this in `RUNTIME.block_on` is no longer necessary, but the logger happens
+    // to be set up via `RUNTIME`, so dropping it would delay the logger's initialization.
     let result = RUNTIME.block_on(async {
         let synthesizer = voicevox_core::Synthesizer::new(
             Arc::new(OpenJtalk::new_without_dic()),
@@ -197,10 +199,10 @@ pub extern "C" fn yukarin_s_forward(
     output: *mut f32,
 ) -> bool {
     let synthesizer = &*lock_synthesizer();
-    let result = RUNTIME.block_on(ensure_initialized!(synthesizer).predict_duration(
+    let result = ensure_initialized!(synthesizer).predict_duration(
         unsafe { std::slice::from_raw_parts_mut(phoneme_list, length as usize) },
         StyleId::new(unsafe { *speaker_id as u32 }),
-    ));
+    );
     match result {
         Ok(output_vec) => {
             let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) };
@@ -227,7 +229,7 @@ pub extern "C" fn yukarin_sa_forward(
     output: *mut f32,
 ) -> bool {
     let synthesizer = &*lock_synthesizer();
-    let result = RUNTIME.block_on(ensure_initialized!(synthesizer).predict_intonation(
+    let result = ensure_initialized!(synthesizer).predict_intonation(
         length as usize,
         unsafe { std::slice::from_raw_parts(vowel_phoneme_list, length as usize) },
         unsafe { std::slice::from_raw_parts(consonant_phoneme_list, length as usize) },
@@ -236,7 +238,7 @@ pub extern "C" fn yukarin_sa_forward(
         unsafe { std::slice::from_raw_parts(start_accent_phrase_list, length as usize) },
         unsafe { std::slice::from_raw_parts(end_accent_phrase_list, length as usize) },
         StyleId::new(unsafe { *speaker_id as u32 }),
-    ));
+    );
     match result {
         Ok(output_vec) => {
             let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) };
@@ -262,13 +264,13 @@ pub extern "C" fn decode_forward(
     let length = length as usize;
     let phoneme_size = phoneme_size as usize;
     let synthesizer = &*lock_synthesizer();
-    let result = RUNTIME.block_on(ensure_initialized!(synthesizer).decode(
+    let result = ensure_initialized!(synthesizer).decode(
         length,
         phoneme_size,
         unsafe { std::slice::from_raw_parts(f0, length) },
         unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
         StyleId::new(unsafe { *speaker_id as u32 }),
-    ));
+    );
     match result {
         Ok(output_vec) => {
             let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) };
diff --git a/crates/voicevox_core_java_api/src/synthesizer.rs b/crates/voicevox_core_java_api/src/synthesizer.rs
index 9d3cb9f9f..5e2cbb8d2 100644
--- a/crates/voicevox_core_java_api/src/synthesizer.rs
+++ b/crates/voicevox_core_java_api/src/synthesizer.rs
@@ -53,7 +53,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'loca
             .get_rust_field::<_, _, Arc<OpenJtalk>>(&open_jtalk, "handle")?
             .clone();
         let internal = voicevox_core::Synthesizer::new(open_jtalk, Box::leak(Box::new(options)))?;
-        env.set_rust_field(&this, "handle", Arc::new(internal))?;
+        env.set_rust_field(&this, "handle", internal)?;
         Ok(())
     })
 }
@@ -64,7 +64,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsGpuMode
 ) -> jboolean {
     throw_if_err(env, false, |env| {
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         Ok(internal.is_gpu_mode())
@@ -78,7 +78,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsGetMetasJ
 ) -> jobject {
     throw_if_err(env, std::ptr::null_mut(), |env| {
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let metas_json = serde_json::to_string(&internal.metas()).expect("should not fail");
@@ -100,7 +100,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsLoadVoice
             .get_rust_field::<_, _, Arc<VoiceModel>>(&model, "handle")?
             .clone();
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
         RUNTIME.block_on(internal.load_voice_model(&model))?;
         Ok(())
@@ -117,7 +117,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsUnloadVoi
         let model_id: String = env.get_string(&model_id)?.into();
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         internal.unload_voice_model(&voicevox_core::VoiceModelId::new(model_id))?;
@@ -138,7 +138,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsLoadedV
         let model_id: String = env.get_string(&model_id)?.into();
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let is_loaded = internal.is_loaded_voice_model(&voicevox_core::VoiceModelId::new(model_id));
@@ -162,7 +162,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let audio_query = RUNTIME.block_on(
@@ -189,7 +189,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let audio_query =
@@ -217,7 +217,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let accent_phrases = RUNTIME.block_on(
@@ -244,7 +244,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let accent_phrases = RUNTIME.block_on(
@@ -273,7 +273,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let replaced_accent_phrases = RUNTIME.block_on(
@@ -303,7 +303,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplacePh
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let replaced_accent_phrases = {
@@ -334,7 +334,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let replaced_accent_phrases = RUNTIME.block_on(
@@ -363,7 +363,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesis
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let wave = {
@@ -397,7 +397,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTtsFromKa
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
             .clone();
 
         let wave = {
@@ -431,7 +431,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTts<'loca
         let style_id = style_id as u32;
 
         let internal = env
-            .get_rust_field::<_, _, Arc<voicevox_core::Synthesizer>>(&this, "handle")?
+            .get_rust_field::<_, _, voicevox_core::Synthesizer>(&this, "handle")?
            .clone();
 
         let wave = {
diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs
index e96a3d8c4..86a64e394 100644
--- a/crates/voicevox_core_python_api/src/lib.rs
+++ b/crates/voicevox_core_python_api/src/lib.rs
@@ -144,7 +144,7 @@ impl OpenJtalk {
 
 #[pyclass]
 struct Synthesizer {
-    synthesizer: Closable<Arc<voicevox_core::Synthesizer>, Self>,
+    synthesizer: Closable<voicevox_core::Synthesizer, Self>,
 }
 
 #[pymethods]
@@ -167,7 +167,9 @@ impl Synthesizer {
                 cpu_num_threads,
             },
         );
-        let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?.into();
+        let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?;
         let synthesizer = Closable::new(synthesizer);
         Ok(Self { synthesizer })
     }
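
A minimal usage sketch of the reworked public API, not part of the patch itself: the dictionary path, model path, and style ID are placeholders, and `VoiceModel::from_path` plus the `Default` impls on the option types are assumptions from the surrounding crate rather than something this diff touches. The signatures of `Synthesizer::new`, `load_voice_model`, and `tts` match the new wrapper type above.

use std::sync::Arc;

use voicevox_core::{InitializeOptions, OpenJtalk, StyleId, Synthesizer, TtsOptions, VoiceModel};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Hypothetical paths; any Open JTalk dictionary and .vvm voice model will do.
    let open_jtalk = Arc::new(OpenJtalk::new("./open_jtalk_dic_utf_8-1.11").await?);
    let synthesizer = Synthesizer::new(open_jtalk, &InitializeOptions::default())?;

    let model = VoiceModel::from_path("./model/sample.vvm").await?;
    synthesizer.load_voice_model(&model).await?;

    // `Synthesizer` is now a cheap `Arc`-backed, `Clone`able handle; every async
    // method clones it and hops onto the blocking pool via `task::asyncify`.
    let wav = synthesizer
        .tts("こんにちは", StyleId::new(302), &TtsOptions::default())
        .await?;
    std::fs::write("audio.wav", wav)?;
    Ok(())
}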
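
Since `task::asyncify` is the one piece of new runtime plumbing, here is a self-contained sketch of what it buys over a bare `spawn_blocking(...).await.unwrap()`: a panic inside the closure resurfaces with its original payload instead of being wrapped in a `JoinError`. The helper body is copied from crates/voicevox_core/src/task.rs above; only `main` is invented for the demonstration.

use std::panic;

// Same body as the patched crate-private helper.
async fn asyncify<F: FnOnce() -> R + Send + 'static, R: Send + 'static>(f: F) -> R {
    tokio::task::spawn_blocking(f)
        .await
        .unwrap_or_else(|err| match err.try_into_panic() {
            Ok(panic) => panic::resume_unwind(panic),
            Err(err) => panic!("{err}"),
        })
}

#[tokio::main]
async fn main() {
    // Run asyncify inside a task so the panic can be caught and inspected here
    // (tokio prints the expected panic message to stderr).
    let handle = tokio::spawn(async { asyncify(|| -> u32 { panic!("boom") }).await });

    // The original payload ("boom") crossed the blocking-thread boundary intact.
    let payload = handle.await.unwrap_err().into_panic();
    assert_eq!(payload.downcast_ref::<&str>(), Some(&"boom"));
}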
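
Finally, a sketch of what the new `__internal::interop` seam looks like from a wrapper crate's side, mirroring the compatible_engine.rs call sites in this patch; the free function is a hypothetical stand-in for the FFI glue around `yukarin_s_forward`:

// Importing the sealed extension trait anonymously, exactly as the C API does,
// exposes the raw inference entry points without widening the public surface.
use voicevox_core::{StyleId, Synthesizer, __internal::interop::PerformInference as _};

fn yukarin_s(synthesizer: &Synthesizer, phoneme_list: &[i64]) -> voicevox_core::Result<Vec<f32>> {
    // Synchronous and CPU-bound: per the new doc comments, async callers
    // should route this through `spawn_blocking` (or `task::asyncify`).
    synthesizer.predict_duration(phoneme_list, StyleId::new(1))
}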