diff --git a/open_dubbing/speech_to_text.py b/open_dubbing/speech_to_text.py index 5c1d90d..fb3c034 100644 --- a/open_dubbing/speech_to_text.py +++ b/open_dubbing/speech_to_text.py @@ -14,6 +14,7 @@ import array import logging +import re from abc import ABC, abstractmethod from typing import Mapping, Sequence @@ -71,6 +72,16 @@ def _transcribe( ) -> str: pass + # Whisper sometimes includes spaces at the beginning of sentences or multiple spaces between words + def _make_sure_single_space(self, sentence: str) -> str: + fixed = re.sub(r"\s{2,}", " ", sentence) + if sentence != fixed: + logging.info(f" _make_sure_single_space: {sentence} - original") + logging.info(f" _make_sure_single_space: {fixed} - fixed") + + fixed = fixed.strip() + return fixed + def transcribe_audio_chunks( self, *, @@ -99,9 +110,7 @@ def transcribe_audio_chunks( vocals_filepath=path, source_language_iso_639_1=iso_639_1, ) - transcribed_text = ( - transcribed_text.strip() - ) # Whisper sometimes includes spaces at the begining of sentences + transcribed_text = self._make_sure_single_space(transcribed_text) except Exception as e: logging.error( f"speech_to_text.transcribe_audio_chunks. 
file '{path}', error: '{e}'" diff --git a/sc.sh b/sc.sh index 9156cf9..9fc2bc5 100755 --- a/sc.sh +++ b/sc.sh @@ -6,7 +6,7 @@ branch_name=$(git rev-parse --abbrev-ref HEAD) declare -a target_languages=("cat") # Catalan (cat) and French (fra) declare -a inputs=($(find ../dubbing/od-videos/ -type f -name "*.mp4")) -declare -a inputs=("videos/jordi.mp4" ) +declare -a inputs=("videos/jobinterview.mp4" ) for input_file in "${inputs[@]}"; do output_directory="output/$(basename "${input_file%.*}").${branch_name}/" diff --git a/tests/speech_to_text_test.py b/tests/speech_to_text_test.py index 0c7a752..a0a6bc2 100644 --- a/tests/speech_to_text_test.py +++ b/tests/speech_to_text_test.py @@ -186,3 +186,17 @@ def test_get_unique_speakers_largest_audio(self): ) assert [("SPEAKER_01", "chunk_114.mp3")] == result + + # Assuming the SpeechToTextFasterWhisper class is already imported + @pytest.mark.parametrize( + "input_text, expected_output", + [ + ("Hello my friends", "Hello my friends"), # Case with two spaces + ("Hello my friends", "Hello my friends"), + ("Hello my friends", "Hello my friends"), + (" Hello my friends ", "Hello my friends"), + ], + ) + def test_make_sure_single_space(self, input_text, expected_output): + result = SpeechToTextFasterWhisper()._make_sure_single_space(input_text) + assert result == expected_output