Skip to content

Commit

Permalink
Better logic to update the voices from assigned_voice and speaker_id
Browse files Browse the repository at this point in the history
  • Loading branch information
jordimas committed Dec 23, 2024
1 parent 4fd0035 commit 510f4b6
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 20 deletions.
1 change: 1 addition & 0 deletions open_dubbing/dubbing.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ def update(self):
)

modified_utterances = self.tts.update_utterance_metadata(
utterance=utterance,
utterance_metadata=modified_utterances,
assigned_voices=assigned_voices,
)
Expand Down
21 changes: 17 additions & 4 deletions open_dubbing/text_to_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pydub import AudioSegment

from open_dubbing.ffmpeg import FFmpeg
from open_dubbing.utterance import Utterance


class Voice(NamedTuple):
Expand Down Expand Up @@ -106,18 +107,30 @@ def _add_text_to_speech_properties(
# Builds a new list of utterance dicts, filling in "assigned_voice" and the
# text-to-speech properties for each entry of `utterance_metadata`.
#   utterance: optional helper exposing get_modified_utterance_fields(); when
#     it is omitted (None) the voice is always looked up from speaker_id.
#   utterance_metadata: the utterance dicts to process; each one is copied
#     with .copy() before being changed.
#   assigned_voices: mapping used to look up a voice by speaker_id.
#     NOTE(review): annotated as possibly None, yet .get() is called on it
#     without a guard - confirm callers never pass None.
# Returns the list of updated copies.
def update_utterance_metadata(
self,
*,
utterance: Utterance | None = None,
utterance_metadata: Sequence[Mapping[str, str | float]],
assigned_voices: Mapping[str, str] | None,
) -> Sequence[Mapping[str, str | float]]:
"""Updates utterance metadata with assigned voices."""
updated_utterance_metadata = []
for metadata_item in utterance_metadata:
new_utterance = metadata_item.copy()
speaker_id = new_utterance.get("speaker_id")
new_utterance["assigned_voice"] = assigned_voices.get(speaker_id)
new_utterance = self._add_text_to_speech_properties(
utterance_metadata=new_utterance
# NOTE(review): the four lines above repeat the statements of the conditional
# block further down and leave a call unclosed; they look like the pre-change
# side of this diff hunk shown next to its replacement - confirm against the
# repository before relying on them.

# Names of the fields the helper reports as changed; empty when no helper
# was supplied.
fields = (
utterance.get_modified_utterance_fields(new_utterance)
if utterance
else []
)
# If "assigned_voice" has changed we give it priority and do not overwrite
# it by recalculating it from speaker_id/gender.
# The voice is therefore recomputed only when there is no helper, or when
# "speaker_id" changed while "assigned_voice" did not.
if not utterance or (
"speaker_id" in fields and "assigned_voice" not in fields
):
speaker_id = new_utterance.get("speaker_id")
new_utterance["assigned_voice"] = assigned_voices.get(speaker_id)
# NOTE(review): indentation is lost in this view, so it is not visible whether
# the call below runs for every utterance or only inside the condition above.
new_utterance = self._add_text_to_speech_properties(
utterance_metadata=new_utterance
)
updated_utterance_metadata.append(new_utterance)
return updated_utterance_metadata

Expand Down
39 changes: 34 additions & 5 deletions open_dubbing/utterance.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,25 @@ def save_utterances(
except Exception as e:
logging.warning(f"Error saving utterance metadata: {e}")

def _get_utterance_fields_to_hash(self, utterance):
    """Return a new dict with every entry of ``utterance`` whose key does not start with "_".

    Keys beginning with an underscore are left out; all other key/value
    pairs are carried over unchanged and in their original order.
    """
    hashable = {}
    for name, content in utterance.items():
        if name.startswith("_"):
            continue
        hashable[name] = content
    return hashable

# Stores SHA-256 hex digests on each utterance dict (in place) and returns the
# same `utterance_metadata` object that was passed in.
def _hash_utterances(self, utterance_metadata):
for utterance in utterance_metadata:
# NOTE(review): this assignment is overwritten two lines below; it looks like
# the pre-change side of the diff hunk - confirm against the repository.
dict_str = json.dumps(utterance, sort_keys=True)
# Hash only what _get_utterance_fields_to_hash() returns, serialised with
# sorted keys so the digest does not depend on key order.
filtered_fields = self._get_utterance_fields_to_hash(utterance)
dict_str = json.dumps(filtered_fields, sort_keys=True)
_hash = hashlib.sha256(dict_str.encode()).hexdigest()
# NOTE(review): both "hash" and "_hash" are written here; "hash" is presumably
# the old key name that "_hash" replaces in this commit - confirm.
utterance["hash"] = _hash
utterance["_hash"] = _hash

# Keep a separate digest for each of these two fields, stored under
# "_<field>_hash". Falsy values (missing, None, "") get no digest.
for field in ["assigned_voice", "speaker_id"]:
value = utterance.get(field)
if value:
# NOTE(review): .encode() assumes the value is a str - confirm that
# speaker ids and voices are never stored as numbers.
utterance[f"_{field}_hash"] = hashlib.sha256(
value.encode()
).hexdigest()

return utterance_metadata

Expand All @@ -129,13 +143,28 @@ def get_files_paths(self, utterance_metadata) -> Tuple[List[str], List[str]]:

return paths, dubbed_paths

def get_modified_utterance_fields(self, utterance):
    """Return the names of the fields whose value no longer matches its stored digest.

    A field is checked only when ``utterance`` also holds a non-empty
    ``_<field>_hash`` entry. That entry is compared with the SHA-256 hex
    digest of the field's current string value.

    Args:
        utterance: Mapping of field names to values, including any
            ``_<field>_hash`` companion entries.

    Returns:
        A list of field names, in the mapping's iteration order, whose
        current value differs from the stored digest. A value that is not
        a string (for example ``None`` or a number) is always reported as
        modified instead of raising ``AttributeError``.
    """
    modified = []
    for field, field_value in utterance.items():
        field_hash = utterance.get(f"_{field}_hash")
        if not field_hash:
            # No digest recorded for this field, so there is nothing to compare.
            continue

        # Only strings can be encoded and hashed; any other value cannot be
        # the string the digest was computed from, so it counts as a change.
        if not isinstance(field_value, str):
            modified.append(field)
            continue

        current_hash = hashlib.sha256(field_value.encode()).hexdigest()
        if current_hash != field_hash:
            modified.append(field)

    return modified

def get_modified_utterances(self, utterance_metadata):
modified = []
for utterance in utterance_metadata:
_hash_utterance = utterance["hash"]
del utterance["hash"]
dict_str = json.dumps(utterance, sort_keys=True)
_hash_utterance = utterance["_hash"]
filtered_fields = self._get_utterance_fields_to_hash(utterance)

dict_str = json.dumps(filtered_fields, sort_keys=True)
_hash = hashlib.sha256(dict_str.encode()).hexdigest()
if _hash_utterance != _hash:
modified.append(utterance)
Expand Down
62 changes: 60 additions & 2 deletions tests/text_to_speech_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import tempfile

from typing import List
from unittest.mock import patch
from unittest.mock import Mock, patch

import pytest

Expand Down Expand Up @@ -126,7 +126,7 @@ def test_dub_utterances_with_speeds(
"start": 0,
"end": 5,
"translated_text": "Hello world",
"speed": 1.0, # Initially set speed to 1.0
"speed": 1.0,
"path": "some/path/file.mp3",
}
]
Expand Down Expand Up @@ -292,3 +292,61 @@ def test_assign_voices(self):
target_language_region="IN",
)
assert {1: "Voice3"} == results

def _get_update_utterance_metadata(self):
    """Return a one-element list holding the utterance dict used by the update tests.

    The utterance has speaker_id "2" and starts out with assigned_voice "Voice0".
    """
    utterance = {
        "speaker_id": "2",
        "start": 0,
        "assigned_voice": "Voice0",
        "end": 5,
        "translated_text": "Hello world",
        "speed": 1.0,
        "path": "some/path/file.mp3",
    }
    return [utterance]

def test_update_utterance_metadata_assign_voice_from_speaker(self):
    """With no Utterance helper, the voice mapped to speaker_id "2" is assigned."""
    speaker_voices = {"1": "Voice1", "2": "Voice2"}
    metadata = self._get_update_utterance_metadata()

    result = TextToSpeechUT().update_utterance_metadata(
        utterance=None,
        utterance_metadata=metadata,
        assigned_voices=speaker_voices,
    )

    first = result[0]
    assert "Voice2" == first["assigned_voice"]

def test_update_utterance_metadata_assign_voice_from_speaker_id(self):
    """When only speaker_id is reported as modified, the voice is re-derived from it."""
    speaker_voices = {"1": "Voice1", "2": "Voice2"}
    metadata = self._get_update_utterance_metadata()

    # Stand-in helper that reports "speaker_id" as the only changed field.
    helper = Mock()
    helper.get_modified_utterance_fields.return_value = {"speaker_id"}

    result = TextToSpeechUT().update_utterance_metadata(
        utterance=helper,
        utterance_metadata=metadata,
        assigned_voices=speaker_voices,
    )

    first = result[0]
    assert "Voice2" == first["assigned_voice"]

def test_update_utterance_metadata_assign_voice_from_assigned_voice(self):
    """When assigned_voice is reported as modified, its current value is kept."""
    speaker_voices = {"1": "Voice1", "2": "Voice2"}
    metadata = self._get_update_utterance_metadata()

    # Stand-in helper that reports "assigned_voice" as the only changed field.
    helper = Mock()
    helper.get_modified_utterance_fields.return_value = {"assigned_voice"}

    result = TextToSpeechUT().update_utterance_metadata(
        utterance=helper,
        utterance_metadata=metadata,
        assigned_voices=speaker_voices,
    )

    first = result[0]
    assert "Voice0" == first["assigned_voice"]
65 changes: 56 additions & 9 deletions tests/utterance_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ def testrun_save_utterance(self):
"id": 1,
"start": 1.26,
"end": 3.94,
"hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7",
"_hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7",
},
{
"id": 2,
"start": 5.24,
"end": 6.629,
"hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4",
"_hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4",
},
],
"metadata": {
Expand All @@ -70,6 +70,7 @@ def test_hash_utterances(self):
{
"start": 1.26,
"end": 3.94,
"_private_not_hashed": "0",
},
{
"start": 5.24,
Expand All @@ -86,12 +87,13 @@ def test_hash_utterances(self):
{
"start": 1.26,
"end": 3.94,
"hash": "2fa6f80e0c81fb8e142f2dbbad0bceff7c21a031833b5752bc1cfd799f6b3bc6",
"_private_not_hashed": "0",
"_hash": "2fa6f80e0c81fb8e142f2dbbad0bceff7c21a031833b5752bc1cfd799f6b3bc6",
},
{
"start": 5.24,
"end": 6.629,
"hash": "34cd5da78cb163ad18996aefffcfeae864727257defc7ae68818a245ca269951",
"_hash": "34cd5da78cb163ad18996aefffcfeae864727257defc7ae68818a245ca269951",
},
]

Expand All @@ -101,13 +103,13 @@ def test_get_modified_utterances(self):
"id": 1,
"start": 1.26,
"end": 3.94,
"hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7",
"_hash": "26d514d9ce21021f51bd010d9946db0f31555ef7145067d4fe5a3b1bdcd84ce7",
},
{
"id": 2,
"start": 5.25,
"end": 6.629,
"hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4",
"_hash": "157dc7fb355c7dc13a0ea687e9fd4a6f6c5c03526a959a64dfe1fa7562fedff4",
},
]
dubbing = Utterance(
Expand All @@ -117,6 +119,7 @@ def test_get_modified_utterances(self):

modified = dubbing.get_modified_utterances(utterances)
assert 1 == len(modified)
assert 2 == modified[0]["id"]

def test_get_without_empty_blocks(self):
utterances = [
Expand All @@ -139,6 +142,7 @@ def test_get_without_empty_blocks(self):

modified = dubbing.get_without_empty_blocks(utterances)
assert 1 == len(modified)
assert "Hola" == modified[0]["text"]

def test_add_unique_ids(self):

Expand Down Expand Up @@ -178,7 +182,7 @@ def _get_master_utterances(self):
"assigned_voice": "ca-ES-EnricNeural",
"speed": 1.0,
"dubbed_path": "output/jordi.central.edge.update/dubbed_chunk_1.26284375_3.94596875.mp3",
"hash": "b01b399ac50f80f87e704918e290ffc5ee0a1962683ba946c627124ea903480d",
"_hash": "b01b399ac50f80f87e704918e290ffc5ee0a1962683ba946c627124ea903480d",
},
{
"id": 2,
Expand All @@ -193,7 +197,7 @@ def _get_master_utterances(self):
"assigned_voice": "ca-ES-EnricNeural",
"speed": 1.0,
"dubbed_path": "output/jordi.central.edge.update/dubbed_chunk_5.24534375_6.629093750000001.mp3",
"hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd",
"_hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd",
},
]

Expand Down Expand Up @@ -241,7 +245,7 @@ def test_update_utterances_operation_update(self):
"assigned_voice": "ca-ES-EnricNeural",
"speed": 1.0,
"dubbed_path": "output/jordi.central.edge.update/dubbed_chunk_5.24534375_6.629093750000001.mp3",
"hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd",
"_hash": "629484afdecb7641e35d686d6348cee4445611690f2f77831e892d52c3128bdd",
}

def test_load_utterances(self):
Expand All @@ -264,3 +268,46 @@ def test_load_utterances(self):
"original_subtitles": False,
"dubbed_subtitles": False,
}

def _get_utterance(self):
    """Return the utterance dict used by the field-hash tests.

    Besides the regular fields it carries "_hash", "_assigned_voice_hash"
    and "_speaker_id_hash" entries.
    """
    utterance = {
        "id": 2,
        "start": 5.24534375,
        "end": 6.64596875,
        "speaker_id": "SPEAKER_00",
        "path": "output/jordi.voices/chunk_5.24534375_6.64596875.mp3",
        "text": "I am from Barcelona.",
        "for_dubbing": "true",
        "gender": "Male",
        "translated_text": "Soc de Barcelona.",
        "assigned_voice": "2",
        "speed": 1.0,
        "dubbed_path": "output/jordi.voices/dubbed_chunk_5.24534375_6.64596875.mp3",
    }
    # Stored digests are added after the regular fields, in the same key order.
    utterance["_hash"] = (
        "ea1d02c92026bc8cd6144a6500489333bdf0b58368817ea5116189d101c1fe9e"
    )
    utterance["_assigned_voice_hash"] = (
        "d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35"
    )
    utterance["_speaker_id_hash"] = (
        "bf4f81ea701e475a4268bb9f36ddb43d7d5c0dbf4578fd5a24c3a8b5a375b4c9"
    )
    return utterance

def test_get_modified_utterances_with_field_hashes(self):
    """An untouched utterance that carries per-field hashes is not reported as modified."""
    checker = Utterance(target_language="cat", output_directory=None)
    untouched = [self._get_utterance()]

    changed = checker.get_modified_utterances(untouched)

    assert 0 == len(changed)

def test_get_modified_utterance_fields_none(self):
    """No field is reported as modified for an unchanged utterance."""
    checker = Utterance(target_language="cat", output_directory="")
    changed_fields = checker.get_modified_utterance_fields(self._get_utterance())
    assert len(changed_fields) == 0

def test_get_modified_utterance_fields_speaker_id(self):
    """Changing speaker_id makes it the only field reported as modified."""
    edited = self._get_utterance()
    edited["speaker_id"] = "SPEAKER_01"

    checker = Utterance(target_language="cat", output_directory="")
    changed_fields = checker.get_modified_utterance_fields(edited)

    assert len(changed_fields) == 1
    assert "speaker_id" == changed_fields[0]

0 comments on commit 510f4b6

Please sign in to comment.