
Commit

Removed language selection and modularized the code structure (issue #4)
kumarpanzade committed Jul 3, 2024
1 parent 19a8174 commit af2b19a
Showing 9 changed files with 195 additions and 181 deletions.
132 changes: 26 additions & 106 deletions app.py
@@ -349,147 +349,67 @@
import os
import numpy as np
import streamlit as st
import torch
import whisper
import librosa
import io
import openai
from pytube import YouTube

# Import configurations and functions from modules
from config import openai_api_key, model_id, model_path
from load_model import load_model
from transcribe_audio import transcribe_audio
from extract_entities import extract_entities
from translate_text import translate_text

# Load environment variables
load_dotenv()

# Set your OpenAI API key
openai.api_key = os.getenv("OPENAI_KEY")

# Function to extract entities using OpenAI API
def extract_entities(text):
prompt = f"""
The following entities are present in Indian Languages.
Please extract the following entities from the text.
Provide entities for both in English and the original language in a structured format:
Text: "{text}"
- Name:
- Phone Numbers:
- Addresses:
- Email:
"""
response = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
{"role": "user", "content": prompt}
],
max_tokens=200
)
entities_text = response.choices[0].message.content

return entities_text

# Function to translate text from Indian languages to English using OpenAI GPT-3.5-turbo
def translate_text(text, source_language):
prompt = f"Translate the following text from {source_language} to English:\n\n{text}"
response = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful assistant that translates text from Indian languages to English."},
{"role": "user", "content": prompt}
],
max_tokens=150
)
translated_text = response.choices[0].message.content

return translated_text

# Use the cache decorator from Streamlit
@st.cache(allow_output_mutation=True)
def load_model(model_id, model_path):
# Define available device (CPU/GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model on available device
model = whisper.load_model(model_id, device=device, download_root=model_path)

# Display model's parameters in the app's logs
print(
f"Model will be run on {device}\n"
f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

return model

def download_audio_from_youtube(url):
yt = YouTube(url)
audio_stream = yt.streams.filter(only_audio=True).first()
audio_file = audio_stream.download(filename='audio.mp4')
return audio_file
# Set OpenAI API key
openai.api_key = openai_api_key

# Main function to run the Streamlit app
def main():
# Display title
st.title("Whisper - Speech to Text App")

# Set up environment variables for model ID and path
model_id = os.environ.get('MODEL_ID', 'small') # Use a smaller model for quicker transcription by default
model_path = os.environ.get('MODEL_PATH', 'whisper_model') # Default path if not set
st.title("Speech to Text App")

# Load the Whisper model
model = load_model(model_id, model_path)

# Add a selectbox for language selection
languages = ['hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'pa'] # List of Indian language codes
language = st.selectbox("Select the language of the audio file", languages, index=0)
# Language selection dropdown
#languages = ['hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'pa']
#language = st.selectbox("Select the language of the audio file", languages, index=0)

# Option to upload audio file or provide YouTube link
st.write("Upload an audio file or provide a YouTube link:")
audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
youtube_link = st.text_input("Or enter a YouTube link")

transcript = {"text": "The audio file could not be transcribed :("}
options = dict(beam_size=5, best_of=5, language=language)
transcribe_options = dict(task="transcribe", **options)
# File uploader for audio files
st.write("Upload an audio file:")
audio_file = st.file_uploader("Select an audio file", type=["mp3", "wav"])

audio_data = None

if audio_file:
# Read file content
# Process uploaded audio file
st.write("We are extracting these entities:\n- Name:\n- Phone Numbers:\n- Addresses:\n- Email:\n- PIN Code:\n- Occupation:\n- Gender:")
audio_bytes = audio_file.read()
st.audio(audio_bytes)

# Convert bytes to a file-like object using io.BytesIO
audio_file = io.BytesIO(audio_bytes)

# Convert to numpy array
audio_data, _ = librosa.load(audio_file, sr=16000) # Load with target sample rate of 16000 for Whisper

elif youtube_link:
try:
audio_file = download_audio_from_youtube(youtube_link)
st.audio(audio_file)

# Load audio file using librosa
audio_data, _ = librosa.load(audio_file, sr=16000)
except Exception as e:
st.error(f"Error downloading audio from YouTube: {e}")
st.error(f"Error loading audio file: {e}")

# Transcribe audio on button click
# Perform transcription and other tasks on button click
if audio_data is not None and st.button("Transcribe"):
with st.spinner("Transcribing audio..."):
transcript = model.transcribe(audio_data, **transcribe_options)
transcription_text = transcript["text"]
transcription_text = transcribe_audio(model, audio_data)
st.write(transcription_text)

# Extract entities from the transcription text
with st.spinner("Extracting entities..."):
entities = extract_entities(transcription_text)
st.write("Extracted Entities:")
st.write(entities)

# Translate transcription to English
with st.spinner("Translating to English..."):
translated_text = translate_text(transcription_text, language)
translated_text = translate_text(transcription_text)
st.write("Translated Text:")
st.write(translated_text)

# Entry point of the script
if __name__ == "__main__":
main()
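
The transcribe_audio.py and translate_text.py modules that app.py now imports are not shown in this excerpt of the diff. Based on the inline code removed above, a rough sketch of what they might contain follows; the function bodies, option values, and prompt wording are assumptions, not the committed code.

# transcribe_audio.py -- hypothetical sketch, inferred from the removed inline transcription code
def transcribe_audio(model, audio_data):
    # mirrors the old `options` dict (beam_size=5, best_of=5); language is no longer forced
    options = dict(task="transcribe", beam_size=5, best_of=5)
    transcript = model.transcribe(audio_data, **options)
    return transcript["text"]

# translate_text.py -- hypothetical sketch; the new call site passes only the text,
# so the source language is presumably left to the model to infer
import openai
from config import openai_api_key

openai.api_key = openai_api_key

def translate_text(text):
    prompt = f"Translate the following text from its original Indian language to English:\n\n{text}"
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates text from Indian languages to English."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response.choices[0].message.content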
8 changes: 8 additions & 0 deletions config.py
@@ -0,0 +1,8 @@
import os
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv("OPENAI_KEY")
model_id = os.getenv('MODEL_ID', 'large-v3')
model_path = os.getenv('MODEL_PATH', 'whisper_model')
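
config.py reads OPENAI_KEY, MODEL_ID, and MODEL_PATH from the environment via python-dotenv, so the app expects a .env file alongside the code (neither python-dotenv nor openai appears in the visible portion of requirements.txt below). A minimal .env might look like the following; the key is a placeholder, not a real credential.

# .env -- example only, values are placeholders
OPENAI_KEY=sk-your-openai-key-here
MODEL_ID=large-v3
MODEL_PATH=whisper_model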
3 changes: 1 addition & 2 deletions download_whisper.py
@@ -3,8 +3,7 @@

model_path = "whisper_model"
model_id = 'large-v3'

# Ensure the directory exists
s
os.makedirs(model_path, exist_ok=True)

# Download model
32 changes: 32 additions & 0 deletions extract_entities.py
@@ -0,0 +1,32 @@
import openai
from config import openai_api_key

openai.api_key = openai_api_key

def extract_entities(text):
prompt = f"""
The following entities are present in Indian Languages.
Please extract the following entities from the text.
Provide entities in both English and the original language of the audio in a well-structured format:
Text: "{text}"
- Name:
- Phone Numbers:
- Addresses:
- Email:
- PIN Code:
- Occupation:
- Gender:
"""
response = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
{"role": "user", "content": prompt}
],
max_tokens=200
)
entities_text = response.choices[0].message.content

return entities_text
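
For illustration, the new module could be exercised like this; the sample sentence is invented, and the model's reply is free-form text that follows the bullet template in the prompt above.

# illustrative usage only; assumes OPENAI_KEY is set in .env
from extract_entities import extract_entities

# Hindi sample: "My name is Ravi Kumar, my phone number is 98765 43210 and I live in Pune."
sample = "मेरा नाम Ravi Kumar है, मेरा फ़ोन नंबर 98765 43210 है और मैं पुणे में रहता हूँ।"
print(extract_entities(sample))
# expected shape of the reply: Name, Phone Numbers, Addresses, Email, PIN Code,
# Occupation and Gender, listed in English and in the original language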
15 changes: 15 additions & 0 deletions load_model.py
@@ -0,0 +1,15 @@
import torch
import whisper
import numpy as np
import streamlit as st

@st.cache(allow_output_mutation=True)
def load_model(model_id, model_path):
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model(model_id, device=device, download_root=model_path)
print(
f"Model will be run on {device}\n"
f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)
return model
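
A side note on the caching decorator: st.cache(allow_output_mutation=True) is deprecated in recent Streamlit releases. If the project later targets Streamlit 1.18 or newer, a sketch of the equivalent using st.cache_resource (not part of this commit) would be:

# sketch only -- same behaviour on Streamlit >= 1.18, where st.cache is deprecated
import torch
import whisper
import streamlit as st

@st.cache_resource
def load_model(model_id: str, model_path: str):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # download_root controls where the Whisper checkpoint is stored and looked up
    return whisper.load_model(model_id, device=device, download_root=model_path)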
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ openai-whisper
streamlit
librosa
pytest
pytube