From 510f4b65780393982af6f80f6dab922e22d9f473 Mon Sep 17 00:00:00 2001 From: Jordi Mas Date: Mon, 23 Dec 2024 13:30:30 +0100 Subject: [PATCH] Better logic to update the voices from assigned_voice and speaker_id --- open_dubbing/dubbing.py | 1 + open_dubbing/text_to_speech.py | 21 ++++++++--- open_dubbing/utterance.py | 39 +++++++++++++++++--- tests/text_to_speech_test.py | 62 ++++++++++++++++++++++++++++++-- tests/utterance_test.py | 65 +++++++++++++++++++++++++++++----- 5 files changed, 168 insertions(+), 20 deletions(-) diff --git a/open_dubbing/dubbing.py b/open_dubbing/dubbing.py index d0923e1..442c216 100644 --- a/open_dubbing/dubbing.py +++ b/open_dubbing/dubbing.py @@ -393,6 +393,7 @@ def update(self): ) modified_utterances = self.tts.update_utterance_metadata( + utterance=utterance, utterance_metadata=modified_utterances, assigned_voices=assigned_voices, ) diff --git a/open_dubbing/text_to_speech.py b/open_dubbing/text_to_speech.py index 91e2659..77927b9 100644 --- a/open_dubbing/text_to_speech.py +++ b/open_dubbing/text_to_speech.py @@ -21,6 +21,7 @@ from pydub import AudioSegment from open_dubbing.ffmpeg import FFmpeg +from open_dubbing.utterance import Utterance class Voice(NamedTuple): @@ -106,6 +107,7 @@ def _add_text_to_speech_properties( def update_utterance_metadata( self, *, + utterance: Utterance | None = None, utterance_metadata: Sequence[Mapping[str, str | float]], assigned_voices: Mapping[str, str] | None, ) -> Sequence[Mapping[str, str | float]]: @@ -113,11 +115,22 @@ def update_utterance_metadata( updated_utterance_metadata = [] for metadata_item in utterance_metadata: new_utterance = metadata_item.copy() - speaker_id = new_utterance.get("speaker_id") - new_utterance["assigned_voice"] = assigned_voices.get(speaker_id) - new_utterance = self._add_text_to_speech_properties( - utterance_metadata=new_utterance + + fields = ( + utterance.get_modified_utterance_fields(new_utterance) + if utterance + else [] ) + # If "assigned_voice" has changed we
give it priority and not overwrite it + # by recalculating from speaker_id/gender + if not utterance or ( + "speaker_id" in fields and "assigned_voice" not in fields + ): + speaker_id = new_utterance.get("speaker_id") + new_utterance["assigned_voice"] = assigned_voices.get(speaker_id) + new_utterance = self._add_text_to_speech_properties( + utterance_metadata=new_utterance + ) updated_utterance_metadata.append(new_utterance) return updated_utterance_metadata diff --git a/open_dubbing/utterance.py b/open_dubbing/utterance.py index e53a4c6..24f608e 100644 --- a/open_dubbing/utterance.py +++ b/open_dubbing/utterance.py @@ -102,11 +102,25 @@ def save_utterances( except Exception as e: logging.warning(f"Error saving utterance metadata: {e}") + def _get_utterance_fields_to_hash(self, utterance): + filtered_fields = { + key: value for key, value in utterance.items() if not key.startswith("_") + } + return filtered_fields + def _hash_utterances(self, utterance_metadata): for utterance in utterance_metadata: - dict_str = json.dumps(utterance, sort_keys=True) + filtered_fields = self._get_utterance_fields_to_hash(utterance) + dict_str = json.dumps(filtered_fields, sort_keys=True) _hash = hashlib.sha256(dict_str.encode()).hexdigest() - utterance["hash"] = _hash + utterance["_hash"] = _hash + + for field in ["assigned_voice", "speaker_id"]: + value = utterance.get(field) + if value: + utterance[f"_{field}_hash"] = hashlib.sha256( + value.encode() + ).hexdigest() return utterance_metadata @@ -129,13 +143,28 @@ def get_files_paths(self, utterance_metadata) -> Tuple[List[str], List[str]]: return paths, dubbed_paths + def get_modified_utterance_fields(self, utterance): + modified = [] + for field in utterance: + field_hash = utterance.get(f"_{field}_hash") + if not field_hash: + continue + + field_value = utterance[field] + current_hash = hashlib.sha256(field_value.encode()).hexdigest() + + if current_hash != field_hash: + modified.append(field) + + return modified + def 
get_modified_utterances(self, utterance_metadata): modified = [] for utterance in utterance_metadata: - _hash_utterance = utterance["hash"] - del utterance["hash"] - dict_str = json.dumps(utterance, sort_keys=True) + _hash_utterance = utterance["_hash"] + filtered_fields = self._get_utterance_fields_to_hash(utterance) + dict_str = json.dumps(filtered_fields, sort_keys=True) _hash = hashlib.sha256(dict_str.encode()).hexdigest() if _hash_utterance != _hash: modified.append(utterance) diff --git a/tests/text_to_speech_test.py b/tests/text_to_speech_test.py index 039bd24..b132584 100644 --- a/tests/text_to_speech_test.py +++ b/tests/text_to_speech_test.py @@ -17,7 +17,7 @@ import tempfile from typing import List -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest @@ -126,7 +126,7 @@ def test_dub_utterances_with_speeds( "start": 0, "end": 5, "translated_text": "Hello world", - "speed": 1.0, # Initially set speed to 1.0 + "speed": 1.0, "path": "some/path/file.mp3", } ] @@ -292,3 +292,61 @@ def test_assign_voices(self): target_language_region="IN", ) assert {1: "Voice3"} == results + + def _get_update_utterance_metadata(self): + return [ + { + "speaker_id": "2", + "start": 0, + "assigned_voice": "Voice0", + "end": 5, + "translated_text": "Hello world", + "speed": 1.0, + "path": "some/path/file.mp3", + } + ] + + def test_update_utterance_metadata_assign_voice_from_speaker(self): + voices = {"1": "Voice1", "2": "Voice2"} + + utterance_metadata = self._get_update_utterance_metadata() + tts = TextToSpeechUT() + updated_utterances = tts.update_utterance_metadata( + utterance=None, + utterance_metadata=utterance_metadata, + assigned_voices=voices, + ) + + assert "Voice2" == updated_utterances[0]["assigned_voice"] + + def test_update_utterance_metadata_assign_voice_from_speaker_id(self): + voices = {"1": "Voice1", "2": "Voice2"} + + utterance_metadata = self._get_update_utterance_metadata() + utterance = Mock() + 
utterance.get_modified_utterance_fields.return_value = {"speaker_id"} + + tts = TextToSpeechUT() + updated_utterances = tts.update_utterance_metadata( + utterance=utterance, + utterance_metadata=utterance_metadata, + assigned_voices=voices, + ) + + assert "Voice2" == updated_utterances[0]["assigned_voice"] + + def test_update_utterance_metadata_assign_voice_from_assigned_voice(self): + voices = {"1": "Voice1", "2": "Voice2"} + + utterance_metadata = self._get_update_utterance_metadata() + utterance = Mock() + utterance.get_modified_utterance_fields.return_value = {"assigned_voice"} + + tts = TextToSpeechUT() + updated_utterances = tts.update_utterance_metadata( + utterance=utterance, + utterance_metadata=utterance_metadata, + assigned_voices=voices, + ) + + assert "Voice0" == updated_utterances[0]["assigned_voice"] diff --git a/tests/utterance_test.py b/tests/utterance_test.py index 3e758ab..48df6b8 100644 --- a/tests/utterance_test.py +++ b/tests/utterance_test.py @@ -50,13 +50,13 @@ def testrun_save_utterance(self): "id": 1, "start": 1.26, "end": 3.94, - "hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7", + "_hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7", }, { "id": 2, "start": 5.24, "end": 6.629, - "hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4", + "_hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4", }, ], "metadata": { @@ -70,6 +70,7 @@ def test_hash_utterances(self): { "start": 1.26, "end": 3.94, + "_private_not_hashed": "0", }, { "start": 5.24, @@ -86,12 +87,13 @@ def test_hash_utterances(self): { "start": 1.26, "end": 3.94, - "hash": "2fa6f80e0c81fb8e142f2dbbad0bceff7c21a031833b5752bc1cfd799f6b3bc6", + "_private_not_hashed": "0", + "_hash": "2fa6f80e0c81fb8e142f2dbbad0bceff7c21a031833b5752bc1cfd799f6b3bc6", }, { "start": 5.24, "end": 6.629, - "hash": "34cd5da78cb163ad18996aefffcfeae864727257defc7ae68818a245ca269951", + "_hash": 
"34cd5da78cb163ad18996aefffcfeae864727257defc7ae68818a245ca269951", }, ] @@ -101,13 +103,13 @@ def test_get_modified_utterances(self): "id": 1, "start": 1.26, "end": 3.94, - "hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7", + "_hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7", }, { "id": 2, "start": 5.25, "end": 6.629, - "hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4", + "_hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4", }, ] dubbing = Utterance( @@ -117,6 +119,7 @@ def test_get_modified_utterances(self): modified = dubbing.get_modified_utterances(utterances) assert 1 == len(modified) + assert 2 == modified[0]["id"] def test_get_without_empty_blocks(self): utterances = [ @@ -139,6 +142,7 @@ def test_get_without_empty_blocks(self): modified = dubbing.get_without_empty_blocks(utterances) assert 1 == len(modified) + assert "Hola" == modified[0]["text"] def test_add_unique_ids(self): @@ -178,7 +182,7 @@ def _get_master_utterances(self): "assigned_voice": "ca-ES-EnricNeural", "speed": 1.0, "dubbed_path": "output/jordi.central.edge.update/dubbed_chunk_1.26284375_3.94596875.mp3", - "hash": "b01b399ac50f80f87e704918e290ffc5ee0a1962683ba946c627124ea903480d", + "_hash": "b01b399ac50f80f87e704918e290ffc5ee0a1962683ba946c627124ea903480d", }, { "id": 2, @@ -193,7 +197,7 @@ def _get_master_utterances(self): "assigned_voice": "ca-ES-EnricNeural", "speed": 1.0, "dubbed_path": "output/jordi.central.edge.update/dubbed_chunk_5.24534375_6.629093750000001.mp3", - "hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd", + "_hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd", }, ] @@ -241,7 +245,7 @@ def test_update_utterances_operation_update(self): "assigned_voice": "ca-ES-EnricNeural", "speed": 1.0, "dubbed_path": "output/jordi.central.edge.update/dubbed_chunk_5.24534375_6.629093750000001.mp3", - "hash": 
"629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd", + "_hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd", } def test_load_utterances(self): @@ -264,3 +268,46 @@ def test_load_utterances(self): "original_subtitles": False, "dubbed_subtitles": False, } + + def _get_utterance(self): + return { + "id": 2, + "start": 5.24534375, + "end": 6.64596875, + "speaker_id": "SPEAKER_00", + "path": "output/jordi.voices/chunk_5.24534375_6.64596875.mp3", + "text": "I am from Barcelona.", + "for_dubbing": "true", + "gender": "Male", + "translated_text": "Soc de Barcelona.", + "assigned_voice": "2", + "speed": 1.0, + "dubbed_path": "output/jordi.voices/dubbed_chunk_5.24534375_6.64596875.mp3", + "_hash": "ea1d02c92026bc8cd6144a6500489333bdf0b58368817ea5116189d101c1fe9e", + "_assigned_voice_hash": "d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35", + "_speaker_id_hash": "bf4f81ea701e475a4268bb9f36ddb43d7d5c0dbf4578fd5a24c3a8b5a375b4c9", + } + + def test_get_modified_utterances_with_field_hashes(self): + utterances = [self._get_utterance()] + dubbing = Utterance( + target_language="cat", + output_directory=None, + ) + + modified = dubbing.get_modified_utterances(utterances) + assert 0 == len(modified) + + def test_get_modified_utterance_fields_none(self): + utterance = self._get_utterance() + u = Utterance(target_language="cat", output_directory="") + fields = u.get_modified_utterance_fields(utterance) + assert len(fields) == 0 + + def test_get_modified_utterance_fields_speaker_id(self): + utterance = self._get_utterance() + utterance["speaker_id"] = "SPEAKER_01" + u = Utterance(target_language="cat", output_directory="") + fields = u.get_modified_utterance_fields(utterance) + assert len(fields) == 1 + assert "speaker_id" == fields[0]